qemu / qcow2-cluster.c (2562 lines · 86.9 KB)
1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24

25
#include "qemu/osdep.h"
26
#include <zlib.h>
27

28
#include "block/block-io.h"
29
#include "qapi/error.h"
30
#include "qcow2.h"
31
#include "qemu/bswap.h"
32
#include "qemu/memalign.h"
33
#include "trace.h"
34

35
int coroutine_fn qcow2_shrink_l1_table(BlockDriverState *bs,
36
                                       uint64_t exact_size)
37
{
38
    BDRVQcow2State *s = bs->opaque;
39
    int new_l1_size, i, ret;
40

41
    if (exact_size >= s->l1_size) {
42
        return 0;
43
    }
44

45
    new_l1_size = exact_size;
46

47
#ifdef DEBUG_ALLOC2
48
    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
49
#endif
50

51
    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
52
    ret = bdrv_co_pwrite_zeroes(bs->file,
53
                                s->l1_table_offset + new_l1_size * L1E_SIZE,
54
                                (s->l1_size - new_l1_size) * L1E_SIZE, 0);
55
    if (ret < 0) {
56
        goto fail;
57
    }
58

59
    ret = bdrv_co_flush(bs->file->bs);
60
    if (ret < 0) {
61
        goto fail;
62
    }
63

64
    BLKDBG_CO_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
65
    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
66
        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
67
            continue;
68
        }
69
        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
70
                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
71
        s->l1_table[i] = 0;
72
    }
73
    return 0;
74

75
fail:
76
    /*
77
     * If the write in the l1_table failed the image may contain a partially
78
     * overwritten l1_table. In this case it would be better to clear the
79
     * l1_table in memory to avoid possible image corruption.
80
     */
81
    memset(s->l1_table + new_l1_size, 0,
82
           (s->l1_size - new_l1_size) * L1E_SIZE);
83
    return ret;
84
}
85

86
int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
87
                        bool exact_size)
88
{
89
    BDRVQcow2State *s = bs->opaque;
90
    int new_l1_size2, ret, i;
91
    uint64_t *new_l1_table;
92
    int64_t old_l1_table_offset, old_l1_size;
93
    int64_t new_l1_table_offset, new_l1_size;
94
    uint8_t data[12];
95

96
    if (min_size <= s->l1_size)
97
        return 0;
98

99
    /* Do a sanity check on min_size before trying to calculate new_l1_size
100
     * (this prevents overflows during the while loop for the calculation of
101
     * new_l1_size) */
102
    if (min_size > INT_MAX / L1E_SIZE) {
103
        return -EFBIG;
104
    }
105

106
    if (exact_size) {
107
        new_l1_size = min_size;
108
    } else {
109
        /* Bump size up to reduce the number of times we have to grow */
110
        new_l1_size = s->l1_size;
111
        if (new_l1_size == 0) {
112
            new_l1_size = 1;
113
        }
114
        while (min_size > new_l1_size) {
115
            new_l1_size = DIV_ROUND_UP(new_l1_size * 3, 2);
116
        }
117
    }
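    /*
     * For illustration: starting from 1, the 3/2 growth above produces the
     * sequence 1, 2, 3, 5, 8, 12, 18, 27, 41, ..., so reaching any min_size
     * takes only O(log(min_size)) grow steps.
     */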
118

119
    QEMU_BUILD_BUG_ON(QCOW_MAX_L1_SIZE > INT_MAX);
120
    if (new_l1_size > QCOW_MAX_L1_SIZE / L1E_SIZE) {
121
        return -EFBIG;
122
    }
123

124
#ifdef DEBUG_ALLOC2
125
    fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n",
126
            s->l1_size, new_l1_size);
127
#endif
128

129
    new_l1_size2 = L1E_SIZE * new_l1_size;
130
    new_l1_table = qemu_try_blockalign(bs->file->bs, new_l1_size2);
131
    if (new_l1_table == NULL) {
132
        return -ENOMEM;
133
    }
134
    memset(new_l1_table, 0, new_l1_size2);
135

136
    if (s->l1_size) {
137
        memcpy(new_l1_table, s->l1_table, s->l1_size * L1E_SIZE);
138
    }
139

140
    /* write new table (align to cluster) */
141
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE);
142
    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
143
    if (new_l1_table_offset < 0) {
144
        qemu_vfree(new_l1_table);
145
        return new_l1_table_offset;
146
    }
147

148
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
149
    if (ret < 0) {
150
        goto fail;
151
    }
152

153
    /* the L1 position has not yet been updated, so these clusters must
154
     * indeed be completely free */
155
    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
156
                                        new_l1_size2, false);
157
    if (ret < 0) {
158
        goto fail;
159
    }
160

161
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
162
    for(i = 0; i < s->l1_size; i++)
163
        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
164
    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_size2,
165
                           new_l1_table, 0);
166
    if (ret < 0)
167
        goto fail;
168
    for(i = 0; i < s->l1_size; i++)
169
        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
170

171
    /* set new table */
172
    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
173
    stl_be_p(data, new_l1_size);
174
    stq_be_p(data + 4, new_l1_table_offset);
175
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
176
                           sizeof(data), data, 0);
177
    if (ret < 0) {
178
        goto fail;
179
    }
180
    qemu_vfree(s->l1_table);
181
    old_l1_table_offset = s->l1_table_offset;
182
    s->l1_table_offset = new_l1_table_offset;
183
    s->l1_table = new_l1_table;
184
    old_l1_size = s->l1_size;
185
    s->l1_size = new_l1_size;
186
    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * L1E_SIZE,
187
                        QCOW2_DISCARD_OTHER);
188
    return 0;
189
 fail:
190
    qemu_vfree(new_l1_table);
191
    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
192
                        QCOW2_DISCARD_OTHER);
193
    return ret;
194
}
195

196
/*
197
 * l2_load
198
 *
199
 * @bs: The BlockDriverState
200
 * @offset: A guest offset, used to calculate what slice of the L2
201
 *          table to load.
202
 * @l2_offset: Offset to the L2 table in the image file.
203
 * @l2_slice: Location to store the pointer to the L2 slice.
204
 *
205
 * Loads an L2 slice into memory (L2 slices are the parts of L2 tables
206
 * that are loaded by the qcow2 cache). If the slice is in the cache,
207
 * the cache is used; otherwise the L2 slice is loaded from the image
208
 * file.
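 *
 * For illustration (the sizes are an assumption, not fixed by this function):
 * with 8-byte L2 entries and a 4 KiB L2 cache slice, each slice holds 512
 * entries, and start_of_slice below is the byte offset of that slice within
 * its L2 table, i.e. l2_entry_size(s) * (l2_index - l2_slice_index).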
209
 */
210
static int GRAPH_RDLOCK
211
l2_load(BlockDriverState *bs, uint64_t offset,
212
        uint64_t l2_offset, uint64_t **l2_slice)
213
{
214
    BDRVQcow2State *s = bs->opaque;
215
    int start_of_slice = l2_entry_size(s) *
216
        (offset_to_l2_index(s, offset) - offset_to_l2_slice_index(s, offset));
217

218
    return qcow2_cache_get(bs, s->l2_table_cache, l2_offset + start_of_slice,
219
                           (void **)l2_slice);
220
}
221

222
/*
223
 * Writes an L1 entry to disk (note that depending on the alignment
224
 * requirements this function may write more than just one entry in
225
 * order to prevent bdrv_pwrite from performing a read-modify-write)
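 *
 * (Illustration, assuming 8-byte L1 entries and a 4 KiB request_alignment:
 * bufsize is then 4 KiB, nentries is 512, and l1_index is rounded down to a
 * multiple of 512 so a whole aligned block of entries is rewritten at once.)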
226
 */
227
int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
228
{
229
    BDRVQcow2State *s = bs->opaque;
230
    int l1_start_index;
231
    int i, ret;
232
    int bufsize = MAX(L1E_SIZE,
233
                      MIN(bs->file->bs->bl.request_alignment, s->cluster_size));
234
    int nentries = bufsize / L1E_SIZE;
235
    g_autofree uint64_t *buf = g_try_new0(uint64_t, nentries);
236

237
    if (buf == NULL) {
238
        return -ENOMEM;
239
    }
240

241
    l1_start_index = QEMU_ALIGN_DOWN(l1_index, nentries);
242
    for (i = 0; i < MIN(nentries, s->l1_size - l1_start_index); i++) {
243
        buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
244
    }
245

246
    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1,
247
            s->l1_table_offset + L1E_SIZE * l1_start_index, bufsize, false);
248
    if (ret < 0) {
249
        return ret;
250
    }
251

252
    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
253
    ret = bdrv_pwrite_sync(bs->file,
254
                           s->l1_table_offset + L1E_SIZE * l1_start_index,
255
                           bufsize, buf, 0);
256
    if (ret < 0) {
257
        return ret;
258
    }
259

260
    return 0;
261
}
262

263
/*
264
 * l2_allocate
265
 *
266
 * Allocate a new l2 entry in the file. If l1_index points to an already
267
 * used entry in the L2 table (i.e. we are doing a copy on write for the L2
268
 * table) copy the contents of the old L2 table into the newly allocated one.
269
 * Otherwise the new table is initialized with zeros.
270
 *
271
 */
272

273
static int GRAPH_RDLOCK l2_allocate(BlockDriverState *bs, int l1_index)
274
{
275
    BDRVQcow2State *s = bs->opaque;
276
    uint64_t old_l2_offset;
277
    uint64_t *l2_slice = NULL;
278
    unsigned slice, slice_size2, n_slices;
279
    int64_t l2_offset;
280
    int ret;
281

282
    old_l2_offset = s->l1_table[l1_index];
283

284
    trace_qcow2_l2_allocate(bs, l1_index);
285

286
    /* allocate a new l2 entry */
287

288
    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * l2_entry_size(s));
289
    if (l2_offset < 0) {
290
        ret = l2_offset;
291
        goto fail;
292
    }
293

294
    /* The offset must fit in the offset field of the L1 table entry */
295
    assert((l2_offset & L1E_OFFSET_MASK) == l2_offset);
296

297
    /* If we're allocating the table at offset 0 then something is wrong */
298
    if (l2_offset == 0) {
299
        qcow2_signal_corruption(bs, true, -1, -1, "Preventing invalid "
300
                                "allocation of L2 table at offset 0");
301
        ret = -EIO;
302
        goto fail;
303
    }
304

305
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
306
    if (ret < 0) {
307
        goto fail;
308
    }
309

310
    /* allocate a new entry in the l2 cache */
311

312
    slice_size2 = s->l2_slice_size * l2_entry_size(s);
313
    n_slices = s->cluster_size / slice_size2;
314

315
    trace_qcow2_l2_allocate_get_empty(bs, l1_index);
316
    for (slice = 0; slice < n_slices; slice++) {
317
        ret = qcow2_cache_get_empty(bs, s->l2_table_cache,
318
                                    l2_offset + slice * slice_size2,
319
                                    (void **) &l2_slice);
320
        if (ret < 0) {
321
            goto fail;
322
        }
323

324
        if ((old_l2_offset & L1E_OFFSET_MASK) == 0) {
325
            /* if there was no old l2 table, clear the new slice */
326
            memset(l2_slice, 0, slice_size2);
327
        } else {
328
            uint64_t *old_slice;
329
            uint64_t old_l2_slice_offset =
330
                (old_l2_offset & L1E_OFFSET_MASK) + slice * slice_size2;
331

332
            /* if there was an old l2 table, read a slice from the disk */
333
            BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ);
334
            ret = qcow2_cache_get(bs, s->l2_table_cache, old_l2_slice_offset,
335
                                  (void **) &old_slice);
336
            if (ret < 0) {
337
                goto fail;
338
            }
339

340
            memcpy(l2_slice, old_slice, slice_size2);
341

342
            qcow2_cache_put(s->l2_table_cache, (void **) &old_slice);
343
        }
344

345
        /* write the l2 slice to the file */
346
        BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE);
347

348
        trace_qcow2_l2_allocate_write_l2(bs, l1_index);
349
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
350
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
351
    }
352

353
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
354
    if (ret < 0) {
355
        goto fail;
356
    }
357

358
    /* update the L1 entry */
359
    trace_qcow2_l2_allocate_write_l1(bs, l1_index);
360
    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
361
    ret = qcow2_write_l1_entry(bs, l1_index);
362
    if (ret < 0) {
363
        goto fail;
364
    }
365

366
    trace_qcow2_l2_allocate_done(bs, l1_index, 0);
367
    return 0;
368

369
fail:
370
    trace_qcow2_l2_allocate_done(bs, l1_index, ret);
371
    if (l2_slice != NULL) {
372
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
373
    }
374
    s->l1_table[l1_index] = old_l2_offset;
375
    if (l2_offset > 0) {
376
        qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
377
                            QCOW2_DISCARD_ALWAYS);
378
    }
379
    return ret;
380
}
381

382
/*
383
 * For a given L2 entry, count the number of contiguous subclusters of
384
 * the same type starting from @sc_from. Compressed clusters are
385
 * treated as if they were divided into subclusters of size
386
 * s->subcluster_size.
387
 *
388
 * Return the number of contiguous subclusters and set @type to the
389
 * subcluster type.
390
 *
391
 * If the L2 entry is invalid return -errno and set @type to
392
 * QCOW2_SUBCLUSTER_INVALID.
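 *
 * Worked example, assuming the usual 32 subclusters per cluster: if the
 * allocation half of @l2_bitmap is 0b111100 (subclusters 2-5 allocated) and
 * @sc_from is 2, the QCOW2_SUBCLUSTER_NORMAL case ORs in bits 0-1 and
 * cto32() then sees six trailing ones, so 6 - 2 = 4 is returned.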
393
 */
394
static int GRAPH_RDLOCK
395
qcow2_get_subcluster_range_type(BlockDriverState *bs, uint64_t l2_entry,
396
                                uint64_t l2_bitmap, unsigned sc_from,
397
                                QCow2SubclusterType *type)
398
{
399
    BDRVQcow2State *s = bs->opaque;
400
    uint32_t val;
401

402
    *type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_from);
403

404
    if (*type == QCOW2_SUBCLUSTER_INVALID) {
405
        return -EINVAL;
406
    } else if (!has_subclusters(s) || *type == QCOW2_SUBCLUSTER_COMPRESSED) {
407
        return s->subclusters_per_cluster - sc_from;
408
    }
409

410
    switch (*type) {
411
    case QCOW2_SUBCLUSTER_NORMAL:
412
        val = l2_bitmap | QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
413
        return cto32(val) - sc_from;
414

415
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
416
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
417
        val = (l2_bitmap | QCOW_OFLAG_SUB_ZERO_RANGE(0, sc_from)) >> 32;
418
        return cto32(val) - sc_from;
419

420
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
421
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
422
        val = ((l2_bitmap >> 32) | l2_bitmap)
423
            & ~QCOW_OFLAG_SUB_ALLOC_RANGE(0, sc_from);
424
        return ctz32(val) - sc_from;
425

426
    default:
427
        g_assert_not_reached();
428
    }
429
}
430

431
/*
432
 * Return the number of contiguous subclusters of the exact same type
433
 * in a given L2 slice, starting from cluster @l2_index, subcluster
434
 * @sc_index. Allocated subclusters are required to be contiguous in
435
 * the image file.
436
 * At most @nb_clusters are checked (note that this means clusters,
437
 * not subclusters).
438
 * Compressed clusters are always processed one by one but for the
439
 * purpose of this count they are treated as if they were divided into
440
 * subclusters of size s->subcluster_size.
441
 * On failure return -errno and update @l2_index to point to the
442
 * invalid entry.
443
 */
444
static int GRAPH_RDLOCK
445
count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
446
                             unsigned sc_index, uint64_t *l2_slice,
447
                             unsigned *l2_index)
448
{
449
    BDRVQcow2State *s = bs->opaque;
450
    int i, count = 0;
451
    bool check_offset = false;
452
    uint64_t expected_offset = 0;
453
    QCow2SubclusterType expected_type = QCOW2_SUBCLUSTER_NORMAL, type;
454

455
    assert(*l2_index + nb_clusters <= s->l2_slice_size);
456

457
    for (i = 0; i < nb_clusters; i++) {
458
        unsigned first_sc = (i == 0) ? sc_index : 0;
459
        uint64_t l2_entry = get_l2_entry(s, l2_slice, *l2_index + i);
460
        uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, *l2_index + i);
461
        int ret = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
462
                                                  first_sc, &type);
463
        if (ret < 0) {
464
            *l2_index += i; /* Point to the invalid entry */
465
            return -EIO;
466
        }
467
        if (i == 0) {
468
            if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
469
                /* Compressed clusters are always processed one by one */
470
                return ret;
471
            }
472
            expected_type = type;
473
            expected_offset = l2_entry & L2E_OFFSET_MASK;
474
            check_offset = (type == QCOW2_SUBCLUSTER_NORMAL ||
475
                            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
476
                            type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC);
477
        } else if (type != expected_type) {
478
            break;
479
        } else if (check_offset) {
480
            expected_offset += s->cluster_size;
481
            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
482
                break;
483
            }
484
        }
485
        count += ret;
486
        /* Stop if there are type changes before the end of the cluster */
487
        if (first_sc + ret < s->subclusters_per_cluster) {
488
            break;
489
        }
490
    }
491

492
    return count;
493
}
494

495
static int coroutine_fn GRAPH_RDLOCK
496
do_perform_cow_read(BlockDriverState *bs, uint64_t src_cluster_offset,
497
                    unsigned offset_in_cluster, QEMUIOVector *qiov)
498
{
499
    int ret;
500

501
    if (qiov->size == 0) {
502
        return 0;
503
    }
504

505
    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_READ);
506

507
    if (!bs->drv) {
508
        return -ENOMEDIUM;
509
    }
510

511
    /*
512
     * We never deal with requests that don't satisfy
513
     * bdrv_check_qiov_request(), and aligning requests to clusters never
514
     * breaks this condition. So, do some assertions before calling
515
     * bs->drv->bdrv_co_preadv_part() which has int64_t arguments.
516
     */
517
    assert(src_cluster_offset <= INT64_MAX);
518
    assert(src_cluster_offset + offset_in_cluster <= INT64_MAX);
519
    /* Cast qiov->size to uint64_t to silence a compiler warning on -m32 */
520
    assert((uint64_t)qiov->size <= INT64_MAX);
521
    bdrv_check_qiov_request(src_cluster_offset + offset_in_cluster, qiov->size,
522
                            qiov, 0, &error_abort);
523
    /*
524
     * Call .bdrv_co_preadv_part() directly rather than the public block-layer
525
     * interface.  This avoids double I/O throttling and request tracking,
526
     * which can lead to deadlock when block layer copy-on-read is enabled.
527
     */
528
    ret = bs->drv->bdrv_co_preadv_part(bs,
529
                                       src_cluster_offset + offset_in_cluster,
530
                                       qiov->size, qiov, 0, 0);
531
    if (ret < 0) {
532
        return ret;
533
    }
534

535
    return 0;
536
}
537

538
static int coroutine_fn GRAPH_RDLOCK
539
do_perform_cow_write(BlockDriverState *bs, uint64_t cluster_offset,
540
                     unsigned offset_in_cluster, QEMUIOVector *qiov)
541
{
542
    BDRVQcow2State *s = bs->opaque;
543
    int ret;
544

545
    if (qiov->size == 0) {
546
        return 0;
547
    }
548

549
    ret = qcow2_pre_write_overlap_check(bs, 0,
550
            cluster_offset + offset_in_cluster, qiov->size, true);
551
    if (ret < 0) {
552
        return ret;
553
    }
554

555
    BLKDBG_CO_EVENT(bs->file, BLKDBG_COW_WRITE);
556
    ret = bdrv_co_pwritev(s->data_file, cluster_offset + offset_in_cluster,
557
                          qiov->size, qiov, 0);
558
    if (ret < 0) {
559
        return ret;
560
    }
561

562
    return 0;
563
}
564

565

566
/*
567
 * get_host_offset
568
 *
569
 * For a given offset of the virtual disk find the equivalent host
570
 * offset in the qcow2 file and store it in *host_offset. Neither
571
 * offset needs to be aligned to a cluster boundary.
572
 *
573
 * If the cluster is unallocated then *host_offset will be 0.
574
 * If the cluster is compressed then *host_offset will contain the l2 entry.
575
 *
576
 * On entry, *bytes is the maximum number of contiguous bytes starting at
577
 * offset that we are interested in.
578
 *
579
 * On exit, *bytes is the number of bytes starting at offset that have the same
580
 * subcluster type and (if applicable) are stored contiguously in the image
581
 * file. The subcluster type is stored in *subcluster_type.
582
 * Compressed clusters are always processed one by one.
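 *
 * Example (assuming 64 KiB clusters): a query at offset 64 KiB + 512 with
 * *bytes = 1 MiB returns in *bytes only the contiguous run of the same
 * subcluster type starting there; if just that one cluster qualifies,
 * *bytes comes back as 64 KiB - 512.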
583
 *
584
 * Returns 0 on success, -errno in error cases.
585
 */
586
int qcow2_get_host_offset(BlockDriverState *bs, uint64_t offset,
587
                          unsigned int *bytes, uint64_t *host_offset,
588
                          QCow2SubclusterType *subcluster_type)
589
{
590
    BDRVQcow2State *s = bs->opaque;
591
    unsigned int l2_index, sc_index;
592
    uint64_t l1_index, l2_offset, *l2_slice, l2_entry, l2_bitmap;
593
    int sc;
594
    unsigned int offset_in_cluster;
595
    uint64_t bytes_available, bytes_needed, nb_clusters;
596
    QCow2SubclusterType type;
597
    int ret;
598

599
    offset_in_cluster = offset_into_cluster(s, offset);
600
    bytes_needed = (uint64_t) *bytes + offset_in_cluster;
601

602
    /* compute how many bytes there are between the start of the cluster
603
     * containing offset and the end of the l2 slice that contains
604
     * the entry pointing to it */
605
    bytes_available =
606
        ((uint64_t) (s->l2_slice_size - offset_to_l2_slice_index(s, offset)))
607
        << s->cluster_bits;
608

609
    if (bytes_needed > bytes_available) {
610
        bytes_needed = bytes_available;
611
    }
612

613
    *host_offset = 0;
614

615
    /* seek to the l2 offset in the l1 table */
616

617
    l1_index = offset_to_l1_index(s, offset);
618
    if (l1_index >= s->l1_size) {
619
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
620
        goto out;
621
    }
622

623
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
624
    if (!l2_offset) {
625
        type = QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN;
626
        goto out;
627
    }
628

629
    if (offset_into_cluster(s, l2_offset)) {
630
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
631
                                " unaligned (L1 index: %#" PRIx64 ")",
632
                                l2_offset, l1_index);
633
        return -EIO;
634
    }
635

636
    /* load the l2 slice in memory */
637

638
    ret = l2_load(bs, offset, l2_offset, &l2_slice);
639
    if (ret < 0) {
640
        return ret;
641
    }
642

643
    /* find the cluster offset for the given disk offset */
644

645
    l2_index = offset_to_l2_slice_index(s, offset);
646
    sc_index = offset_to_sc_index(s, offset);
647
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
648
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
649

650
    nb_clusters = size_to_clusters(s, bytes_needed);
651
    /* bytes_needed <= *bytes + offset_in_cluster, both of which are unsigned
652
     * integers; the minimum cluster size is 512, so this assertion is always
653
     * true */
654
    assert(nb_clusters <= INT_MAX);
655

656
    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
657
    if (s->qcow_version < 3 && (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
658
                                type == QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
659
        qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
660
                                " in pre-v3 image (L2 offset: %#" PRIx64
661
                                ", L2 index: %#x)", l2_offset, l2_index);
662
        ret = -EIO;
663
        goto fail;
664
    }
665
    switch (type) {
666
    case QCOW2_SUBCLUSTER_INVALID:
667
        break; /* This is handled by count_contiguous_subclusters() below */
668
    case QCOW2_SUBCLUSTER_COMPRESSED:
669
        if (has_data_file(bs)) {
670
            qcow2_signal_corruption(bs, true, -1, -1, "Compressed cluster "
671
                                    "entry found in image with external data "
672
                                    "file (L2 offset: %#" PRIx64 ", L2 index: "
673
                                    "%#x)", l2_offset, l2_index);
674
            ret = -EIO;
675
            goto fail;
676
        }
677
        *host_offset = l2_entry;
678
        break;
679
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
680
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
681
        break;
682
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
683
    case QCOW2_SUBCLUSTER_NORMAL:
684
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
685
        uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
686
        *host_offset = host_cluster_offset + offset_in_cluster;
687
        if (offset_into_cluster(s, host_cluster_offset)) {
688
            qcow2_signal_corruption(bs, true, -1, -1,
689
                                    "Cluster allocation offset %#"
690
                                    PRIx64 " unaligned (L2 offset: %#" PRIx64
691
                                    ", L2 index: %#x)", host_cluster_offset,
692
                                    l2_offset, l2_index);
693
            ret = -EIO;
694
            goto fail;
695
        }
696
        if (has_data_file(bs) && *host_offset != offset) {
697
            qcow2_signal_corruption(bs, true, -1, -1,
698
                                    "External data file host cluster offset %#"
699
                                    PRIx64 " does not match guest cluster "
700
                                    "offset: %#" PRIx64
701
                                    ", L2 index: %#x)", host_cluster_offset,
702
                                    offset - offset_in_cluster, l2_index);
703
            ret = -EIO;
704
            goto fail;
705
        }
706
        break;
707
    }
708
    default:
709
        abort();
710
    }
711

712
    sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,
713
                                      l2_slice, &l2_index);
714
    if (sc < 0) {
715
        qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster entry found "
716
                                " (L2 offset: %#" PRIx64 ", L2 index: %#x)",
717
                                l2_offset, l2_index);
718
        ret = -EIO;
719
        goto fail;
720
    }
721
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
722

723
    bytes_available = ((int64_t)sc + sc_index) << s->subcluster_bits;
724

725
out:
726
    if (bytes_available > bytes_needed) {
727
        bytes_available = bytes_needed;
728
    }
729

730
    /* bytes_available <= bytes_needed <= *bytes + offset_in_cluster;
731
     * subtracting offset_in_cluster will therefore definitely yield something
732
     * not exceeding UINT_MAX */
733
    assert(bytes_available - offset_in_cluster <= UINT_MAX);
734
    *bytes = bytes_available - offset_in_cluster;
735

736
    *subcluster_type = type;
737

738
    return 0;
739

740
fail:
741
    qcow2_cache_put(s->l2_table_cache, (void **)&l2_slice);
742
    return ret;
743
}
744

745
/*
746
 * get_cluster_table
747
 *
748
 * for a given disk offset, load (and allocate if needed)
749
 * the appropriate slice of its l2 table.
750
 *
751
 * the cluster index in the l2 slice is given to the caller.
752
 *
753
 * Returns 0 on success, -errno in failure case
754
 */
755
static int GRAPH_RDLOCK
756
get_cluster_table(BlockDriverState *bs, uint64_t offset,
757
                  uint64_t **new_l2_slice, int *new_l2_index)
758
{
759
    BDRVQcow2State *s = bs->opaque;
760
    unsigned int l2_index;
761
    uint64_t l1_index, l2_offset;
762
    uint64_t *l2_slice = NULL;
763
    int ret;
764

765
    /* seek to the l2 offset in the l1 table */
766

767
    l1_index = offset_to_l1_index(s, offset);
768
    if (l1_index >= s->l1_size) {
769
        ret = qcow2_grow_l1_table(bs, l1_index + 1, false);
770
        if (ret < 0) {
771
            return ret;
772
        }
773
    }
774

775
    assert(l1_index < s->l1_size);
776
    l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
777
    if (offset_into_cluster(s, l2_offset)) {
778
        qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64
779
                                " unaligned (L1 index: %#" PRIx64 ")",
780
                                l2_offset, l1_index);
781
        return -EIO;
782
    }
783

784
    if (!(s->l1_table[l1_index] & QCOW_OFLAG_COPIED)) {
785
        /* First allocate a new L2 table (and do COW if needed) */
786
        ret = l2_allocate(bs, l1_index);
787
        if (ret < 0) {
788
            return ret;
789
        }
790

791
        /* Then decrease the refcount of the old table */
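        /*
         * (Ordering note: l2_allocate() has already written the updated L1
         * entry pointing at the new table, so freeing the old table here
         * cannot leave the on-disk L1 entry dangling.)
         */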
792
        if (l2_offset) {
793
            qcow2_free_clusters(bs, l2_offset, s->l2_size * l2_entry_size(s),
794
                                QCOW2_DISCARD_OTHER);
795
        }
796

797
        /* Get the offset of the newly-allocated l2 table */
798
        l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
799
        assert(offset_into_cluster(s, l2_offset) == 0);
800
    }
801

802
    /* load the l2 slice in memory */
803
    ret = l2_load(bs, offset, l2_offset, &l2_slice);
804
    if (ret < 0) {
805
        return ret;
806
    }
807

808
    /* find the cluster offset for the given disk offset */
809

810
    l2_index = offset_to_l2_slice_index(s, offset);
811

812
    *new_l2_slice = l2_slice;
813
    *new_l2_index = l2_index;
814

815
    return 0;
816
}
817

818
/*
819
 * alloc_compressed_cluster_offset
820
 *
821
 * For a given offset on the virtual disk, allocate a new compressed cluster
822
 * and put the host offset of the cluster into *host_offset. If a cluster is
823
 * already allocated at the offset, return an error.
824
 *
825
 * Return 0 on success and -errno in error cases
826
 */
827
int coroutine_fn GRAPH_RDLOCK
828
qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset,
829
                                      int compressed_size, uint64_t *host_offset)
830
{
831
    BDRVQcow2State *s = bs->opaque;
832
    int l2_index, ret;
833
    uint64_t *l2_slice;
834
    int64_t cluster_offset;
835
    int nb_csectors;
836

837
    if (has_data_file(bs)) {
838
        return 0;
839
    }
840

841
    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
842
    if (ret < 0) {
843
        return ret;
844
    }
845

846
    /* Compression can't overwrite anything. Fail if the cluster was already
847
     * allocated. */
848
    cluster_offset = get_l2_entry(s, l2_slice, l2_index);
849
    if (cluster_offset & L2E_OFFSET_MASK) {
850
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
851
        return -EIO;
852
    }
853

854
    cluster_offset = qcow2_alloc_bytes(bs, compressed_size);
855
    if (cluster_offset < 0) {
856
        qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
857
        return cluster_offset;
858
    }
859

860
    nb_csectors =
861
        (cluster_offset + compressed_size - 1) / QCOW2_COMPRESSED_SECTOR_SIZE -
862
        (cluster_offset / QCOW2_COMPRESSED_SECTOR_SIZE);
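    /*
     * i.e. the index distance between the last and the first 512-byte sector
     * touched by the compressed data: one less than the total number of
     * sectors covered, matching the "additional sectors" count stored in a
     * compressed cluster descriptor.
     */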
863

864
    /* The offset and size must fit in their fields of the L2 table entry */
865
    assert((cluster_offset & s->cluster_offset_mask) == cluster_offset);
866
    assert((nb_csectors & s->csize_mask) == nb_csectors);
867

868
    cluster_offset |= QCOW_OFLAG_COMPRESSED |
869
                      ((uint64_t)nb_csectors << s->csize_shift);
870

871
    /* update L2 table */
872

873
    /* compressed clusters never have the copied flag */
874

875
    BLKDBG_CO_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED);
876
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
877
    set_l2_entry(s, l2_slice, l2_index, cluster_offset);
878
    if (has_subclusters(s)) {
879
        set_l2_bitmap(s, l2_slice, l2_index, 0);
880
    }
881
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
882

883
    *host_offset = cluster_offset & s->cluster_offset_mask;
884
    return 0;
885
}
886

887
static int coroutine_fn GRAPH_RDLOCK
888
perform_cow(BlockDriverState *bs, QCowL2Meta *m)
889
{
890
    BDRVQcow2State *s = bs->opaque;
891
    Qcow2COWRegion *start = &m->cow_start;
892
    Qcow2COWRegion *end = &m->cow_end;
893
    unsigned buffer_size;
894
    unsigned data_bytes = end->offset - (start->offset + start->nb_bytes);
895
    bool merge_reads;
896
    uint8_t *start_buffer, *end_buffer;
897
    QEMUIOVector qiov;
898
    int ret;
899

900
    assert(start->nb_bytes <= UINT_MAX - end->nb_bytes);
901
    assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes);
902
    assert(start->offset + start->nb_bytes <= end->offset);
903

904
    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
905
        return 0;
906
    }
907

908
    /* If we have to read both the start and end COW regions and the
909
     * middle region is not too large then perform just one read
910
     * operation */
911
    merge_reads = start->nb_bytes && end->nb_bytes && data_bytes <= 16384;
912
    if (merge_reads) {
913
        buffer_size = start->nb_bytes + data_bytes + end->nb_bytes;
914
    } else {
915
        /* If we have to do two reads, add some padding in the middle
916
         * if necessary to make sure that the end region is optimally
917
         * aligned. */
918
        size_t align = bdrv_opt_mem_align(bs);
919
        assert(align > 0 && align <= UINT_MAX);
920
        assert(QEMU_ALIGN_UP(start->nb_bytes, align) <=
921
               UINT_MAX - end->nb_bytes);
922
        buffer_size = QEMU_ALIGN_UP(start->nb_bytes, align) + end->nb_bytes;
923
    }
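    /*
     * Either way the buffer is laid out as [start region | middle | end
     * region]; the middle part (old data in the merged case, untouched
     * padding otherwise) is never written back. end_buffer below points at
     * the tail of this allocation.
     */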
924

925
    /* Reserve a buffer large enough to store all the data that we're
926
     * going to read */
927
    start_buffer = qemu_try_blockalign(bs, buffer_size);
928
    if (start_buffer == NULL) {
929
        return -ENOMEM;
930
    }
931
    /* The part of the buffer where the end region is located */
932
    end_buffer = start_buffer + buffer_size - end->nb_bytes;
933

934
    qemu_iovec_init(&qiov, 2 + (m->data_qiov ?
935
                                qemu_iovec_subvec_niov(m->data_qiov,
936
                                                       m->data_qiov_offset,
937
                                                       data_bytes)
938
                                : 0));
939

940
    qemu_co_mutex_unlock(&s->lock);
941
    /* First we read the existing data from both COW regions. We
942
     * either read the whole region in one go, or the start and end
943
     * regions separately. */
944
    if (merge_reads) {
945
        qemu_iovec_add(&qiov, start_buffer, buffer_size);
946
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
947
    } else {
948
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
949
        ret = do_perform_cow_read(bs, m->offset, start->offset, &qiov);
950
        if (ret < 0) {
951
            goto fail;
952
        }
953

954
        qemu_iovec_reset(&qiov);
955
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
956
        ret = do_perform_cow_read(bs, m->offset, end->offset, &qiov);
957
    }
958
    if (ret < 0) {
959
        goto fail;
960
    }
961

962
    /* Encrypt the data if necessary before writing it */
963
    if (bs->encrypted) {
964
        ret = qcow2_co_encrypt(bs,
965
                               m->alloc_offset + start->offset,
966
                               m->offset + start->offset,
967
                               start_buffer, start->nb_bytes);
968
        if (ret < 0) {
969
            goto fail;
970
        }
971

972
        ret = qcow2_co_encrypt(bs,
973
                               m->alloc_offset + end->offset,
974
                               m->offset + end->offset,
975
                               end_buffer, end->nb_bytes);
976
        if (ret < 0) {
977
            goto fail;
978
        }
979
    }
980

981
    /* And now we can write everything. If we have the guest data we
982
     * can write everything in one single operation */
983
    if (m->data_qiov) {
984
        qemu_iovec_reset(&qiov);
985
        if (start->nb_bytes) {
986
            qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
987
        }
988
        qemu_iovec_concat(&qiov, m->data_qiov, m->data_qiov_offset, data_bytes);
989
        if (end->nb_bytes) {
990
            qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
991
        }
992
        /* NOTE: we have a write_aio blkdebug event here followed by
993
         * a cow_write one in do_perform_cow_write(), but there's only
994
         * one single I/O operation */
995
        BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
996
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
997
    } else {
998
        /* If there's no guest data then write both COW regions separately */
999
        qemu_iovec_reset(&qiov);
1000
        qemu_iovec_add(&qiov, start_buffer, start->nb_bytes);
1001
        ret = do_perform_cow_write(bs, m->alloc_offset, start->offset, &qiov);
1002
        if (ret < 0) {
1003
            goto fail;
1004
        }
1005

1006
        qemu_iovec_reset(&qiov);
1007
        qemu_iovec_add(&qiov, end_buffer, end->nb_bytes);
1008
        ret = do_perform_cow_write(bs, m->alloc_offset, end->offset, &qiov);
1009
    }
1010

1011
fail:
1012
    qemu_co_mutex_lock(&s->lock);
1013

1014
    /*
1015
     * Before we update the L2 table to actually point to the new cluster, we
1016
     * need to be sure that the refcounts have been increased and COW was
1017
     * handled.
1018
     */
1019
    if (ret == 0) {
1020
        qcow2_cache_depends_on_flush(s->l2_table_cache);
1021
    }
1022

1023
    qemu_vfree(start_buffer);
1024
    qemu_iovec_destroy(&qiov);
1025
    return ret;
1026
}
1027

1028
int coroutine_fn qcow2_alloc_cluster_link_l2(BlockDriverState *bs,
1029
                                             QCowL2Meta *m)
1030
{
1031
    BDRVQcow2State *s = bs->opaque;
1032
    int i, j = 0, l2_index, ret;
1033
    uint64_t *old_cluster, *l2_slice;
1034
    uint64_t cluster_offset = m->alloc_offset;
1035

1036
    trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
1037
    assert(m->nb_clusters > 0);
1038

1039
    old_cluster = g_try_new(uint64_t, m->nb_clusters);
1040
    if (old_cluster == NULL) {
1041
        ret = -ENOMEM;
1042
        goto err;
1043
    }
1044

1045
    /* copy content of unmodified sectors */
1046
    ret = perform_cow(bs, m);
1047
    if (ret < 0) {
1048
        goto err;
1049
    }
1050

1051
    /* Update L2 table. */
1052
    if (s->use_lazy_refcounts) {
1053
        qcow2_mark_dirty(bs);
1054
    }
1055
    if (qcow2_need_accurate_refcounts(s)) {
1056
        qcow2_cache_set_dependency(bs, s->l2_table_cache,
1057
                                   s->refcount_block_cache);
1058
    }
1059

1060
    ret = get_cluster_table(bs, m->offset, &l2_slice, &l2_index);
1061
    if (ret < 0) {
1062
        goto err;
1063
    }
1064
    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
1065

1066
    assert(l2_index + m->nb_clusters <= s->l2_slice_size);
1067
    assert(m->cow_end.offset + m->cow_end.nb_bytes <=
1068
           m->nb_clusters << s->cluster_bits);
1069
    for (i = 0; i < m->nb_clusters; i++) {
1070
        uint64_t offset = cluster_offset + ((uint64_t)i << s->cluster_bits);
1071
        /* if two concurrent writes happen to the same unallocated cluster
1072
         * each write allocates a separate cluster and writes data concurrently.
1073
         * The first one to complete updates l2 table with pointer to its
1074
         * cluster; the second one has to do RMW (which is done above by
1075
         * perform_cow()), update l2 table with its cluster pointer and free
1076
         * old cluster. This is what this loop does */
1077
        if (get_l2_entry(s, l2_slice, l2_index + i) != 0) {
1078
            old_cluster[j++] = get_l2_entry(s, l2_slice, l2_index + i);
1079
        }
1080

1081
        /* The offset must fit in the offset field of the L2 table entry */
1082
        assert((offset & L2E_OFFSET_MASK) == offset);
1083

1084
        set_l2_entry(s, l2_slice, l2_index + i, offset | QCOW_OFLAG_COPIED);
1085

1086
        /* Update bitmap with the subclusters that were just written */
1087
        if (has_subclusters(s) && !m->prealloc) {
1088
            uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1089
            unsigned written_from = m->cow_start.offset;
1090
            unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes;
1091
            int first_sc, last_sc;
1092
            /* Narrow written_from and written_to down to the current cluster */
1093
            written_from = MAX(written_from, i << s->cluster_bits);
1094
            written_to   = MIN(written_to, (i + 1) << s->cluster_bits);
1095
            assert(written_from < written_to);
1096
            first_sc = offset_to_sc_index(s, written_from);
1097
            last_sc  = offset_to_sc_index(s, written_to - 1);
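            /*
             * Example, assuming 2 MiB clusters with 32 subclusters of 64 KiB:
             * a write touching bytes 100..69999 of the first cluster gives
             * first_sc = 0 and last_sc = 1, so alloc bits 0-1 are set and the
             * corresponding zero bits cleared below.
             */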
1098
            l2_bitmap |= QCOW_OFLAG_SUB_ALLOC_RANGE(first_sc, last_sc + 1);
1099
            l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO_RANGE(first_sc, last_sc + 1);
1100
            set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
1101
        }
1102
     }
1103

1104

1105
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
1106

1107
    /*
1108
     * If this was a COW, we need to decrease the refcount of the old cluster.
1109
     *
1110
     * Don't discard clusters that reach a refcount of 0 (e.g. compressed
1111
     * clusters), the next write will reuse them anyway.
1112
     */
1113
    if (!m->keep_old_clusters && j != 0) {
1114
        for (i = 0; i < j; i++) {
1115
            qcow2_free_any_cluster(bs, old_cluster[i], QCOW2_DISCARD_NEVER);
1116
        }
1117
    }
1118

1119
    ret = 0;
1120
err:
1121
    g_free(old_cluster);
1122
    return ret;
1123
 }
1124

1125
/**
1126
 * Frees the allocated clusters because the request failed and they won't
1127
 * actually be linked.
1128
 */
1129
void coroutine_fn qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
1130
{
1131
    BDRVQcow2State *s = bs->opaque;
1132
    if (!has_data_file(bs) && !m->keep_old_clusters) {
1133
        qcow2_free_clusters(bs, m->alloc_offset,
1134
                            m->nb_clusters << s->cluster_bits,
1135
                            QCOW2_DISCARD_NEVER);
1136
    }
1137
}
1138

1139
/*
1140
 * For a given write request, create a new QCowL2Meta structure, add
1141
 * it to @m and the BDRVQcow2State.cluster_allocs list. If the write
1142
 * request does not need copy-on-write or changes to the L2 metadata
1143
 * then this function does nothing.
1144
 *
1145
 * @host_cluster_offset points to the beginning of the first cluster.
1146
 *
1147
 * @guest_offset and @bytes indicate the offset and length of the
1148
 * request.
1149
 *
1150
 * @l2_slice contains the L2 entries of all clusters involved in this
1151
 * write request.
1152
 *
1153
 * If @keep_old is true it means that the clusters were already
1154
 * allocated and will be overwritten. If false then the clusters are
1155
 * new and we have to decrease the reference count of the old ones.
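 *
 * The resulting COW regions bracket the guest write within the touched
 * clusters, roughly:
 *
 *   |<- cow_start ->|<------ guest write ------>|<- cow_end ->|
 *   cluster start                  end of last written subcluster/cluster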
1156
 *
1157
 * Returns 0 on success, -errno on failure.
1158
 */
1159
static int coroutine_fn GRAPH_RDLOCK
1160
calculate_l2_meta(BlockDriverState *bs, uint64_t host_cluster_offset,
1161
                  uint64_t guest_offset, unsigned bytes, uint64_t *l2_slice,
1162
                  QCowL2Meta **m, bool keep_old)
1163
{
1164
    BDRVQcow2State *s = bs->opaque;
1165
    int sc_index, l2_index = offset_to_l2_slice_index(s, guest_offset);
1166
    uint64_t l2_entry, l2_bitmap;
1167
    unsigned cow_start_from, cow_end_to;
1168
    unsigned cow_start_to = offset_into_cluster(s, guest_offset);
1169
    unsigned cow_end_from = cow_start_to + bytes;
1170
    unsigned nb_clusters = size_to_clusters(s, cow_end_from);
1171
    QCowL2Meta *old_m = *m;
1172
    QCow2SubclusterType type;
1173
    int i;
1174
    bool skip_cow = keep_old;
1175

1176
    assert(nb_clusters <= s->l2_slice_size - l2_index);
1177

1178
    /* Check the type of all affected subclusters */
1179
    for (i = 0; i < nb_clusters; i++) {
1180
        l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1181
        l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
1182
        if (skip_cow) {
1183
            unsigned write_from = MAX(cow_start_to, i << s->cluster_bits);
1184
            unsigned write_to = MIN(cow_end_from, (i + 1) << s->cluster_bits);
1185
            int first_sc = offset_to_sc_index(s, write_from);
1186
            int last_sc = offset_to_sc_index(s, write_to - 1);
1187
            int cnt = qcow2_get_subcluster_range_type(bs, l2_entry, l2_bitmap,
1188
                                                      first_sc, &type);
1189
            /* Is any of the subclusters of type != QCOW2_SUBCLUSTER_NORMAL ? */
1190
            if (type != QCOW2_SUBCLUSTER_NORMAL || first_sc + cnt <= last_sc) {
1191
                skip_cow = false;
1192
            }
1193
        } else {
1194
            /* If we can't skip the cow we can still look for invalid entries */
1195
            type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, 0);
1196
        }
1197
        if (type == QCOW2_SUBCLUSTER_INVALID) {
1198
            int l1_index = offset_to_l1_index(s, guest_offset);
1199
            uint64_t l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
1200
            qcow2_signal_corruption(bs, true, -1, -1, "Invalid cluster "
1201
                                    "entry found (L2 offset: %#" PRIx64
1202
                                    ", L2 index: %#x)",
1203
                                    l2_offset, l2_index + i);
1204
            return -EIO;
1205
        }
1206
    }
1207

1208
    if (skip_cow) {
1209
        return 0;
1210
    }
1211

1212
    /* Get the L2 entry of the first cluster */
1213
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
1214
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1215
    sc_index = offset_to_sc_index(s, guest_offset);
1216
    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
1217

1218
    if (!keep_old) {
1219
        switch (type) {
1220
        case QCOW2_SUBCLUSTER_COMPRESSED:
1221
            cow_start_from = 0;
1222
            break;
1223
        case QCOW2_SUBCLUSTER_NORMAL:
1224
        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1225
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1226
            if (has_subclusters(s)) {
1227
                /* Skip all leading zero and unallocated subclusters */
1228
                uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1229
                cow_start_from =
1230
                    MIN(sc_index, ctz32(alloc_bitmap)) << s->subcluster_bits;
1231
            } else {
1232
                cow_start_from = 0;
1233
            }
1234
            break;
1235
        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1236
        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1237
            cow_start_from = sc_index << s->subcluster_bits;
1238
            break;
1239
        default:
1240
            g_assert_not_reached();
1241
        }
1242
    } else {
1243
        switch (type) {
1244
        case QCOW2_SUBCLUSTER_NORMAL:
1245
            cow_start_from = cow_start_to;
1246
            break;
1247
        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1248
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1249
            cow_start_from = sc_index << s->subcluster_bits;
1250
            break;
1251
        default:
1252
            g_assert_not_reached();
1253
        }
1254
    }
1255

1256
    /* Get the L2 entry of the last cluster */
1257
    l2_index += nb_clusters - 1;
1258
    l2_entry = get_l2_entry(s, l2_slice, l2_index);
1259
    l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);
1260
    sc_index = offset_to_sc_index(s, guest_offset + bytes - 1);
1261
    type = qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, sc_index);
1262

1263
    if (!keep_old) {
1264
        switch (type) {
1265
        case QCOW2_SUBCLUSTER_COMPRESSED:
1266
            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1267
            break;
1268
        case QCOW2_SUBCLUSTER_NORMAL:
1269
        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1270
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1271
            cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
1272
            if (has_subclusters(s)) {
1273
                /* Skip all trailing zero and unallocated subclusters */
1274
                uint32_t alloc_bitmap = l2_bitmap & QCOW_L2_BITMAP_ALL_ALLOC;
1275
                cow_end_to -=
1276
                    MIN(s->subclusters_per_cluster - sc_index - 1,
1277
                        clz32(alloc_bitmap)) << s->subcluster_bits;
1278
            }
1279
            break;
1280
        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
1281
        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
1282
            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1283
            break;
1284
        default:
1285
            g_assert_not_reached();
1286
        }
1287
    } else {
1288
        switch (type) {
1289
        case QCOW2_SUBCLUSTER_NORMAL:
1290
            cow_end_to = cow_end_from;
1291
            break;
1292
        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
1293
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
1294
            cow_end_to = ROUND_UP(cow_end_from, s->subcluster_size);
1295
            break;
1296
        default:
1297
            g_assert_not_reached();
1298
        }
1299
    }
1300

1301
    *m = g_malloc0(sizeof(**m));
1302
    **m = (QCowL2Meta) {
1303
        .next           = old_m,
1304

1305
        .alloc_offset   = host_cluster_offset,
1306
        .offset         = start_of_cluster(s, guest_offset),
1307
        .nb_clusters    = nb_clusters,
1308

1309
        .keep_old_clusters = keep_old,
1310

1311
        .cow_start = {
1312
            .offset     = cow_start_from,
1313
            .nb_bytes   = cow_start_to - cow_start_from,
1314
        },
1315
        .cow_end = {
1316
            .offset     = cow_end_from,
1317
            .nb_bytes   = cow_end_to - cow_end_from,
1318
        },
1319
    };
1320

1321
    qemu_co_queue_init(&(*m)->dependent_requests);
1322
    QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
1323

1324
    return 0;
1325
}
1326

1327
/*
1328
 * Returns true if writing to the cluster pointed to by @l2_entry
1329
 * requires a new allocation (that is, if the cluster is unallocated
1330
 * or has refcount > 1 and therefore cannot be written in-place).
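 * (QCOW_OFLAG_COPIED is only set while a cluster's refcount is exactly 1,
 * so a cluster without it is shared, e.g. with an internal snapshot, and
 * must be copied out before it can be rewritten.)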
1331
 */
1332
static bool GRAPH_RDLOCK
1333
cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
1334
{
1335
    switch (qcow2_get_cluster_type(bs, l2_entry)) {
1336
    case QCOW2_CLUSTER_NORMAL:
1337
    case QCOW2_CLUSTER_ZERO_ALLOC:
1338
        if (l2_entry & QCOW_OFLAG_COPIED) {
1339
            return false;
1340
        }
1341
        /* fallthrough */
1342
    case QCOW2_CLUSTER_UNALLOCATED:
1343
    case QCOW2_CLUSTER_COMPRESSED:
1344
    case QCOW2_CLUSTER_ZERO_PLAIN:
1345
        return true;
1346
    default:
1347
        abort();
1348
    }
1349
}
1350

1351
/*
1352
 * Returns the number of contiguous clusters that can be written to
1353
 * using one single write request, starting from @l2_index.
1354
 * At most @nb_clusters are checked.
1355
 *
1356
 * If @new_alloc is true this counts clusters that are either
1357
 * unallocated, or allocated but with refcount > 1 (so they need to be
1358
 * newly allocated and COWed).
1359
 *
1360
 * If @new_alloc is false this counts clusters that are already
1361
 * allocated and can be overwritten in-place (this includes clusters
1362
 * of type QCOW2_CLUSTER_ZERO_ALLOC).
1363
 */
1364
static int GRAPH_RDLOCK
1365
count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
1366
                            uint64_t *l2_slice, int l2_index, bool new_alloc)
1367
{
1368
    BDRVQcow2State *s = bs->opaque;
1369
    uint64_t l2_entry = get_l2_entry(s, l2_slice, l2_index);
1370
    uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
1371
    int i;
1372

1373
    for (i = 0; i < nb_clusters; i++) {
1374
        l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
1375
        if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
1376
            break;
1377
        }
1378
        if (!new_alloc) {
1379
            if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
1380
                break;
1381
            }
1382
            expected_offset += s->cluster_size;
1383
        }
1384
    }
1385

1386
    assert(i <= nb_clusters);
1387
    return i;
1388
}
1389

1390
/*
1391
 * Check if there already is an AIO write request in flight which allocates
1392
 * the same cluster. In this case we need to wait until the previous
1393
 * request has completed and updated the L2 table accordingly.
1394
 *
1395
 * Returns:
1396
 *   0       if there was no dependency. *cur_bytes indicates the number of
1397
 *           bytes from guest_offset that can be read before the next
1398
 *           dependency must be processed (or the request is complete)
1399
 *
1400
 *   -EAGAIN if we had to wait for another request, previously gathered
1401
 *           information on cluster allocation may be invalid now. The caller
1402
 *           must start over anyway, so consider *cur_bytes undefined.
1403
 */
1404
static int coroutine_fn handle_dependencies(BlockDriverState *bs,
1405
                                            uint64_t guest_offset,
1406
                                            uint64_t *cur_bytes, QCowL2Meta **m)
1407
{
1408
    BDRVQcow2State *s = bs->opaque;
1409
    QCowL2Meta *old_alloc;
1410
    uint64_t bytes = *cur_bytes;
1411

1412
    QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) {
1413

1414
        uint64_t start = guest_offset;
1415
        uint64_t end = start + bytes;
1416
        uint64_t old_start = start_of_cluster(s, l2meta_cow_start(old_alloc));
1417
        uint64_t old_end = ROUND_UP(l2meta_cow_end(old_alloc), s->cluster_size);
1418

1419
        if (end <= old_start || start >= old_end) {
1420
            /* No intersection */
1421
            continue;
1422
        }
1423

1424
        if (old_alloc->keep_old_clusters &&
1425
            (end <= l2meta_cow_start(old_alloc) ||
1426
             start >= l2meta_cow_end(old_alloc)))
1427
        {
1428
            /*
1429
             * Clusters intersect but COW areas don't, and the cluster itself is
1430
             * already allocated. So, there is no actual conflict.
1431
             */
1432
            continue;
1433
        }
1434

1435
        /* Conflict */
1436

1437
        if (start < old_start) {
1438
            /* Stop at the start of a running allocation */
1439
            bytes = old_start - start;
1440
        } else {
1441
            bytes = 0;
1442
        }
1443

1444
        /*
1445
         * Stop if an l2meta already exists. After yielding, it wouldn't
1446
         * be valid any more, so we'd have to clean up the old L2Metas
1447
         * and deal with requests depending on them before starting to
1448
         * gather new ones. Not worth the trouble.
1449
         */
1450
        if (bytes == 0 && *m) {
1451
            *cur_bytes = 0;
1452
            return 0;
1453
        }
1454

1455
        if (bytes == 0) {
1456
            /*
1457
             * Wait for the dependency to complete. We need to recheck
1458
             * the free/allocated clusters when we continue.
1459
             */
1460
            qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
1461
            return -EAGAIN;
1462
        }
1463
    }
1464

1465
    /* Make sure that existing clusters and new allocations are only used up to
1466
     * the next dependency if we shortened the request above */
1467
    *cur_bytes = bytes;
1468

1469
    return 0;
1470
}
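
/*
 * Worked example of the overlap check in handle_dependencies() (illustrative
 * only; the numbers assume 64 KiB clusters): suppose an in-flight allocation
 * has a COW region spanning [0x20000, 0x40000) and a new request covers
 * [0x18000, 0x28000). The ranges intersect (end > old_start and
 * start < old_end), and because start < old_start the new request is merely
 * shortened to bytes = 0x20000 - 0x18000 = 0x8000; the caller picks up the
 * overlapping tail in its next loop iteration. Had the new request started
 * inside the in-flight COW region, bytes would be 0 and the coroutine would
 * block on old_alloc->dependent_requests and return -EAGAIN afterwards.
 */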

/*
 * Checks how many already allocated clusters that don't require a new
 * allocation there are at the given guest_offset (up to *bytes).
 * If *host_offset is not INV_OFFSET, only physically contiguous clusters
 * beginning at this host offset are counted.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no allocated clusters are available at the given offset.
 *          *bytes is normally unchanged. It is set to 0 if the cluster
 *          is allocated and can be overwritten in-place but doesn't have
 *          the right physical offset.
 *
 *   1:     if allocated clusters that can be overwritten in place are
 *          available at the requested offset. *bytes may have decreased
 *          and describes the length of the area that can be written to.
 *
 *  -errno: in error cases
 */
static int coroutine_fn GRAPH_RDLOCK
handle_copied(BlockDriverState *bs, uint64_t guest_offset,
              uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t l2_entry, cluster_offset;
    uint64_t *l2_slice;
    uint64_t nb_clusters;
    unsigned int keep_clusters;
    int ret;

    trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset,
                              *bytes);

    assert(*host_offset == INV_OFFSET || offset_into_cluster(s, guest_offset)
                                      == offset_into_cluster(s, *host_offset));

    /*
     * Calculate the number of clusters to look for. We stop at L2 slice
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_slice_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
    nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    l2_entry = get_l2_entry(s, l2_slice, l2_index);
    cluster_offset = l2_entry & L2E_OFFSET_MASK;

    if (!cluster_needs_new_alloc(bs, l2_entry)) {
        if (offset_into_cluster(s, cluster_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
                                    "%#" PRIx64 " unaligned (guest offset: %#"
                                    PRIx64 ")", l2_entry & QCOW_OFLAG_ZERO ?
                                    "Preallocated zero" : "Data",
                                    cluster_offset, guest_offset);
            ret = -EIO;
            goto out;
        }

        /* If a specific host_offset is required, check it */
        if (*host_offset != INV_OFFSET && cluster_offset != *host_offset) {
            *bytes = 0;
            ret = 0;
            goto out;
        }

        /* We keep all QCOW_OFLAG_COPIED clusters */
        keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
                                                    l2_index, false);
        assert(keep_clusters <= nb_clusters);

        *bytes = MIN(*bytes,
                 keep_clusters * s->cluster_size
                 - offset_into_cluster(s, guest_offset));
        assert(*bytes != 0);

        ret = calculate_l2_meta(bs, cluster_offset, guest_offset,
                                *bytes, l2_slice, m, true);
        if (ret < 0) {
            goto out;
        }

        ret = 1;
    } else {
        ret = 0;
    }

    /* Cleanup */
out:
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    /* Only return a host offset if we actually made progress. Otherwise we
     * would make requirements for handle_alloc() that it can't fulfill */
    if (ret > 0) {
        *host_offset = cluster_offset + offset_into_cluster(s, guest_offset);
    }

    return ret;
}

/*
 * Allocates new clusters for the given guest_offset.
 *
 * At most *nb_clusters are allocated, and on return *nb_clusters is updated to
 * contain the number of clusters that have been allocated and are contiguous
 * in the image file.
 *
 * If *host_offset is not INV_OFFSET, it specifies the offset in the image file
 * at which the new clusters must start. *nb_clusters can be 0 on return in
 * this case if the cluster at host_offset is already in use. If *host_offset
 * is INV_OFFSET, the clusters can be allocated anywhere in the image file.
 *
 * *host_offset is updated to contain the offset into the image file at which
 * the first allocated cluster starts.
 *
 * Return 0 on success and -errno in error cases. -EAGAIN means that the
 * function has been waiting for another request and the allocation must be
 * restarted, but the whole request should not be failed.
 */
static int coroutine_fn GRAPH_RDLOCK
do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
                        uint64_t *host_offset, uint64_t *nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;

    trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset,
                                         *host_offset, *nb_clusters);

    if (has_data_file(bs)) {
        assert(*host_offset == INV_OFFSET ||
               *host_offset == start_of_cluster(s, guest_offset));
        *host_offset = start_of_cluster(s, guest_offset);
        return 0;
    }

    /* Allocate new clusters */
    trace_qcow2_cluster_alloc_phys(qemu_coroutine_self());
    if (*host_offset == INV_OFFSET) {
        int64_t cluster_offset =
            qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size);
        if (cluster_offset < 0) {
            return cluster_offset;
        }
        *host_offset = cluster_offset;
        return 0;
    } else {
        int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters);
        if (ret < 0) {
            return ret;
        }
        *nb_clusters = ret;
        return 0;
    }
}

/*
 * Allocates new clusters for an area that is either still unallocated or
 * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
 * clusters are only allocated if the new allocation can match the specified
 * host offset.
 *
 * Note that guest_offset may not be cluster aligned. In this case, the
 * returned *host_offset points to the exact byte referenced by guest_offset
 * and therefore isn't cluster aligned either.
 *
 * Returns:
 *   0:     if no clusters could be allocated. *bytes is set to 0,
 *          *host_offset is left unchanged.
 *
 *   1:     if new clusters were allocated. *bytes may be decreased if the
 *          new allocation doesn't cover all of the requested area.
 *          *host_offset is updated to contain the host offset of the first
 *          newly allocated cluster.
 *
 *  -errno: in error cases
 */
static int coroutine_fn GRAPH_RDLOCK
handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
             uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    int l2_index;
    uint64_t *l2_slice;
    uint64_t nb_clusters;
    int ret;

    uint64_t alloc_cluster_offset;

    trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
                             *bytes);
    assert(*bytes > 0);

    /*
     * Calculate the number of clusters to look for. We stop at L2 slice
     * boundaries to keep things simple.
     */
    nb_clusters =
        size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes);

    l2_index = offset_to_l2_slice_index(s, guest_offset);
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
    nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);

    /* Find L2 entry for the first involved cluster */
    ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    nb_clusters = count_single_write_clusters(bs, nb_clusters,
                                              l2_slice, l2_index, true);

    /* This function is only called when there were no non-COW clusters, so if
     * we can't find any unallocated or COW clusters either, something is
     * wrong with our code. */
    assert(nb_clusters > 0);

    /* Allocate at a given offset in the image file */
    alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
        start_of_cluster(s, *host_offset);
    ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
                                  &nb_clusters);
    if (ret < 0) {
        goto out;
    }

    /* Can't extend contiguous allocation */
    if (nb_clusters == 0) {
        *bytes = 0;
        ret = 0;
        goto out;
    }

    assert(alloc_cluster_offset != INV_OFFSET);

    /*
     * Save info needed for metadata update.
     *
     * requested_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the (possibly shortened
     * before) write request.
     *
     * avail_bytes: Number of bytes from the start of the first
     * newly allocated cluster to the end of the last newly allocated cluster.
     *
     * nb_bytes: The number of bytes from the start of the first
     * newly allocated cluster to the end of the area that the write
     * request actually writes to (excluding COW at the end)
     */
    uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
    int avail_bytes = nb_clusters << s->cluster_bits;
    int nb_bytes = MIN(requested_bytes, avail_bytes);

    *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
    *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
    assert(*bytes != 0);

    ret = calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
                            l2_slice, m, false);
    if (ret < 0) {
        goto out;
    }

    ret = 1;

out:
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
    return ret;
}

/*
 * For a given area on the virtual disk defined by @offset and @bytes,
 * find the corresponding area on the qcow2 image, allocating new
 * clusters (or subclusters) if necessary. The result can span a
 * combination of allocated and previously unallocated clusters.
 *
 * Note that offset may not be cluster aligned. In this case, the returned
 * *host_offset points to the exact byte referenced by offset and therefore
 * isn't cluster aligned either.
 *
 * On return, @host_offset is set to the beginning of the requested
 * area. This area is guaranteed to be contiguous on the qcow2 file
 * but it can be smaller than initially requested. In this case @bytes
 * is updated with the actual size.
 *
 * If any clusters or subclusters were allocated then @m contains a
 * list with the information of all the affected regions. Note that
 * this can happen regardless of whether this function succeeds or
 * not. The caller is responsible for updating the L2 metadata of the
 * allocated clusters (on success) or freeing them (on failure), and
 * for clearing the contents of @m afterwards in both cases.
 *
 * If the request conflicts with another write request in flight, the coroutine
 * is queued and will be reentered when the dependency has completed.
 *
 * Return 0 on success and -errno in error cases
 */
int coroutine_fn qcow2_alloc_host_offset(BlockDriverState *bs, uint64_t offset,
                                         unsigned int *bytes,
                                         uint64_t *host_offset,
                                         QCowL2Meta **m)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, remaining;
    uint64_t cluster_offset;
    uint64_t cur_bytes;
    int ret;

    trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);

again:
    start = offset;
    remaining = *bytes;
    cluster_offset = INV_OFFSET;
    *host_offset = INV_OFFSET;
    cur_bytes = 0;
    *m = NULL;

    while (true) {

        if (*host_offset == INV_OFFSET && cluster_offset != INV_OFFSET) {
            *host_offset = cluster_offset;
        }

        assert(remaining >= cur_bytes);

        start           += cur_bytes;
        remaining       -= cur_bytes;

        if (cluster_offset != INV_OFFSET) {
            cluster_offset += cur_bytes;
        }

        if (remaining == 0) {
            break;
        }

        cur_bytes = remaining;

        /*
         * Now start gathering as many contiguous clusters as possible:
         *
         * 1. Check for overlaps with in-flight allocations
         *
         *      a) Overlap not in the first cluster -> shorten this request and
         *         let the caller handle the rest in its next loop iteration.
         *
         *      b) Real overlaps of two requests. Yield and restart the search
         *         for contiguous clusters (the situation could have changed
         *         while we were sleeping)
         *
         *      c) TODO: Request starts in the same cluster as the in-flight
         *         allocation ends. Shorten the COW of the in-flight allocation,
         *         set cluster_offset to write to the same cluster and set up
         *         the right synchronisation between the in-flight request and
         *         the new one.
         */
        ret = handle_dependencies(bs, start, &cur_bytes, m);
        if (ret == -EAGAIN) {
            /* Currently handle_dependencies() doesn't yield if we already had
             * an allocation. If it did, we would have to clean up the L2Meta
             * structs before starting over. */
            assert(*m == NULL);
            goto again;
        } else if (ret < 0) {
            return ret;
        } else if (cur_bytes == 0) {
            break;
        } else {
            /* handle_dependencies() may have decreased cur_bytes (shortened
             * the allocations below) so that the next dependency is processed
             * correctly during the next loop iteration. */
        }

        /*
         * 2. Count contiguous COPIED clusters.
         */
        ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else if (cur_bytes == 0) {
            break;
        }

        /*
         * 3. If the request still hasn't completed, allocate new clusters,
         *    considering any cluster_offset of steps 1c or 2.
         */
        ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m);
        if (ret < 0) {
            return ret;
        } else if (ret) {
            continue;
        } else {
            assert(cur_bytes == 0);
            break;
        }
    }

    *bytes -= remaining;
    assert(*bytes > 0);
    assert(*host_offset != INV_OFFSET);
    assert(offset_into_cluster(s, *host_offset) ==
           offset_into_cluster(s, offset));

    return 0;
}
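
/*
 * Rough usage sketch for qcow2_alloc_host_offset() (illustrative and
 * simplified; the actual caller is qcow2_co_pwritev_part() in qcow2.c, which
 * additionally handles encryption and copy-on-write of the untouched head and
 * tail of each cluster):
 *
 *     while (bytes != 0) {
 *         cur_bytes = MIN(bytes, INT_MAX);
 *         qemu_co_mutex_lock(&s->lock);
 *         ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
 *                                       &host_offset, &l2meta);
 *         qemu_co_mutex_unlock(&s->lock);
 *         // write cur_bytes of guest data at host_offset in s->data_file,
 *         // then commit or free every QCowL2Meta queued in l2meta
 *         offset += cur_bytes;
 *         bytes -= cur_bytes;
 *     }
 */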

/*
 * This discards as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 slice) and returns the number of discarded
 * clusters.
 */
static int GRAPH_RDLOCK
discard_in_l2_slice(BlockDriverState *bs, uint64_t offset, uint64_t nb_clusters,
                    enum qcow2_discard_type type, bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 slice */
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
        uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
        uint64_t new_l2_entry = old_l2_entry;
        uint64_t new_l2_bitmap = old_l2_bitmap;
        QCow2ClusterType cluster_type =
            qcow2_get_cluster_type(bs, old_l2_entry);
        bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
                              !full_discard &&
                              (s->discard_no_unref &&
                               type == QCOW2_DISCARD_REQUEST);

        /*
         * If full_discard is true, the cluster should not read back as zeroes,
         * but rather fall through to the backing file.
         *
         * If full_discard is false, make sure that a discarded area reads back
         * as zeroes for v3 images (we cannot do it for v2 without actually
         * writing a zero-filled buffer). We can skip the operation if the
         * cluster is already marked as zero, or if it's unallocated and we
         * don't have a backing file.
         *
         * TODO We might want to use bdrv_block_status(bs) here, but we're
         * holding s->lock, so that doesn't work today.
         */
        if (full_discard) {
            new_l2_entry = new_l2_bitmap = 0;
        } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
            if (has_subclusters(s)) {
                if (keep_reference) {
                    new_l2_entry = old_l2_entry;
                } else {
                    new_l2_entry = 0;
                }
                new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
            } else {
                if (s->qcow_version >= 3) {
                    if (keep_reference) {
                        new_l2_entry |= QCOW_OFLAG_ZERO;
                    } else {
                        new_l2_entry = QCOW_OFLAG_ZERO;
                    }
                } else {
                    new_l2_entry = 0;
                }
            }
        }

        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
            continue;
        }

        /* First remove L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
        if (has_subclusters(s)) {
            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
        }
        if (!keep_reference) {
            /* Then decrease the refcount */
            qcow2_free_any_cluster(bs, old_l2_entry, type);
        } else if (s->discard_passthrough[type] &&
                   (cluster_type == QCOW2_CLUSTER_NORMAL ||
                    cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
            /* If we keep the reference, pass on the discard still */
            bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
                          s->cluster_size);
        }
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
}
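
/*
 * Behavioural summary of the keep_reference path above (hedged, for
 * illustration): with the discard-no-unref option enabled and a
 * guest-initiated discard (type == QCOW2_DISCARD_REQUEST) of a normal,
 * uncompressed cluster, the L2 entry keeps its host offset -- only the zero
 * flag or the subcluster bitmap changes -- and the refcount is left alone;
 * the discard is only passed through to the data file when
 * s->discard_passthrough[type] is set. With full_discard the entry is
 * cleared completely so that reads fall through to the backing file.
 */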

int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
                          uint64_t bytes, enum qcow2_discard_type type,
                          bool full_discard)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    int64_t cleared;
    int ret;

    /* Caller must pass aligned values, except at image end */
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
           end_offset == bs->total_sectors << BDRV_SECTOR_BITS);

    nb_clusters = size_to_clusters(s, bytes);

    s->cache_discards = true;

    /* Each L2 slice is handled by its own loop iteration */
    while (nb_clusters > 0) {
        cleared = discard_in_l2_slice(bs, offset, nb_clusters, type,
                                      full_discard);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}

/*
 * This zeroes as many clusters of nb_clusters as possible at once (i.e.
 * all clusters in the same L2 slice) and returns the number of zeroed
 * clusters.
 */
static int coroutine_fn GRAPH_RDLOCK
zero_in_l2_slice(BlockDriverState *bs, uint64_t offset,
                 uint64_t nb_clusters, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
    int l2_index;
    int ret;
    int i;

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    /* Limit nb_clusters to one L2 slice */
    nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
    assert(nb_clusters <= INT_MAX);

    for (i = 0; i < nb_clusters; i++) {
        uint64_t old_l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
        uint64_t old_l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
        QCow2ClusterType type = qcow2_get_cluster_type(bs, old_l2_entry);
        bool unmap = (type == QCOW2_CLUSTER_COMPRESSED) ||
            ((flags & BDRV_REQ_MAY_UNMAP) && qcow2_cluster_is_allocated(type));
        bool keep_reference =
            (s->discard_no_unref && type != QCOW2_CLUSTER_COMPRESSED);
        uint64_t new_l2_entry = old_l2_entry;
        uint64_t new_l2_bitmap = old_l2_bitmap;

        if (unmap && !keep_reference) {
            new_l2_entry = 0;
        }

        if (has_subclusters(s)) {
            new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
        } else {
            new_l2_entry |= QCOW_OFLAG_ZERO;
        }

        if (old_l2_entry == new_l2_entry && old_l2_bitmap == new_l2_bitmap) {
            continue;
        }

        /* First update L2 entries */
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
        set_l2_entry(s, l2_slice, l2_index + i, new_l2_entry);
        if (has_subclusters(s)) {
            set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
        }

        if (unmap) {
            if (!keep_reference) {
                /* Then decrease the refcount */
                qcow2_free_any_cluster(bs, old_l2_entry, QCOW2_DISCARD_REQUEST);
            } else if (s->discard_passthrough[QCOW2_DISCARD_REQUEST] &&
                       (type == QCOW2_CLUSTER_NORMAL ||
                        type == QCOW2_CLUSTER_ZERO_ALLOC)) {
                /* If we keep the reference, pass on the discard still */
                bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
                            s->cluster_size);
            }
        }
    }

    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return nb_clusters;
}

static int coroutine_fn GRAPH_RDLOCK
zero_l2_subclusters(BlockDriverState *bs, uint64_t offset,
                    unsigned nb_subclusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_slice;
    uint64_t old_l2_bitmap, l2_bitmap;
    int l2_index, ret, sc = offset_to_sc_index(s, offset);

    /* For full clusters use zero_in_l2_slice() instead */
    assert(nb_subclusters > 0 && nb_subclusters < s->subclusters_per_cluster);
    assert(sc + nb_subclusters <= s->subclusters_per_cluster);
    assert(offset_into_subcluster(s, offset) == 0);

    ret = get_cluster_table(bs, offset, &l2_slice, &l2_index);
    if (ret < 0) {
        return ret;
    }

    switch (qcow2_get_cluster_type(bs, get_l2_entry(s, l2_slice, l2_index))) {
    case QCOW2_CLUSTER_COMPRESSED:
        ret = -ENOTSUP; /* We cannot partially zeroize compressed clusters */
        goto out;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        g_assert_not_reached();
    }

    old_l2_bitmap = l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index);

    l2_bitmap |=  QCOW_OFLAG_SUB_ZERO_RANGE(sc, sc + nb_subclusters);
    l2_bitmap &= ~QCOW_OFLAG_SUB_ALLOC_RANGE(sc, sc + nb_subclusters);

    if (old_l2_bitmap != l2_bitmap) {
        set_l2_bitmap(s, l2_slice, l2_index, l2_bitmap);
        qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
    }

    ret = 0;
out:
    qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);

    return ret;
}

int coroutine_fn qcow2_subcluster_zeroize(BlockDriverState *bs, uint64_t offset,
                                          uint64_t bytes, int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t end_offset = offset + bytes;
    uint64_t nb_clusters;
    unsigned head, tail;
    int64_t cleared;
    int ret;

    /* If we have to stay in sync with an external data file, zero out
     * s->data_file first. */
    if (data_file_is_raw(bs)) {
        assert(has_data_file(bs));
        ret = bdrv_co_pwrite_zeroes(s->data_file, offset, bytes, flags);
        if (ret < 0) {
            return ret;
        }
    }

    /* Caller must pass aligned values, except at image end */
    assert(offset_into_subcluster(s, offset) == 0);
    assert(offset_into_subcluster(s, end_offset) == 0 ||
           end_offset >= bs->total_sectors << BDRV_SECTOR_BITS);

    /*
     * The zero flag is only supported by version 3 and newer. However, if we
     * have no backing file, we can resort to discard in version 2.
     */
    if (s->qcow_version < 3) {
        if (!bs->backing) {
            return qcow2_cluster_discard(bs, offset, bytes,
                                         QCOW2_DISCARD_REQUEST, false);
        }
        return -ENOTSUP;
    }

    head = MIN(end_offset, ROUND_UP(offset, s->cluster_size)) - offset;
    offset += head;

    tail = (end_offset >= bs->total_sectors << BDRV_SECTOR_BITS) ? 0 :
        end_offset - MAX(offset, start_of_cluster(s, end_offset));
    end_offset -= tail;

    s->cache_discards = true;

    if (head) {
        ret = zero_l2_subclusters(bs, offset - head,
                                  size_to_subclusters(s, head));
        if (ret < 0) {
            goto fail;
        }
    }

    /* Each L2 slice is handled by its own loop iteration */
    nb_clusters = size_to_clusters(s, end_offset - offset);

    while (nb_clusters > 0) {
        cleared = zero_in_l2_slice(bs, offset, nb_clusters, flags);
        if (cleared < 0) {
            ret = cleared;
            goto fail;
        }

        nb_clusters -= cleared;
        offset += (cleared * s->cluster_size);
    }

    if (tail) {
        ret = zero_l2_subclusters(bs, end_offset, size_to_subclusters(s, tail));
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;
fail:
    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    return ret;
}
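
/*
 * Worked example of the head/tail split above (numbers are illustrative and
 * assume 64 KiB clusters with 32 subclusters of 2 KiB each): a zeroize
 * request at offset = 0x10800 of bytes = 0x20800 gives end_offset = 0x31000.
 * head = MIN(0x31000, 0x20000) - 0x10800 = 0xF800, handled by
 * zero_l2_subclusters() as 31 subclusters; the middle [0x20000, 0x30000) is
 * one full cluster for zero_in_l2_slice(); the tail
 * 0x31000 - 0x30000 = 0x1000 is two more subclusters.
 */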

/*
 * Expands all zero clusters in a specific L1 table (or deallocates them, for
 * non-backed non-pre-allocated zero clusters).
 *
 * l1_entries and *visited_l1_entries are used to keep track of progress for
 * status_cb(). l1_entries contains the total number of L1 entries and
 * *visited_l1_entries counts all visited L1 entries.
 */
static int GRAPH_RDLOCK
expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
                           int l1_size, int64_t *visited_l1_entries,
                           int64_t l1_entries,
                           BlockDriverAmendStatusCB *status_cb,
                           void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    bool is_active_l1 = (l1_table == s->l1_table);
    uint64_t *l2_slice = NULL;
    unsigned slice, slice_size2, n_slices;
    int ret;
    int i, j;

    /* qcow2_downgrade() is not allowed in images with subclusters */
    assert(!has_subclusters(s));

    slice_size2 = s->l2_slice_size * l2_entry_size(s);
    n_slices = s->cluster_size / slice_size2;

    if (!is_active_l1) {
        /* inactive L2 tables require a buffer to be stored in when loading
         * them from disk */
        l2_slice = qemu_try_blockalign(bs->file->bs, slice_size2);
        if (l2_slice == NULL) {
            return -ENOMEM;
        }
    }

    for (i = 0; i < l1_size; i++) {
        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
        uint64_t l2_refcount;

        if (!l2_offset) {
            /* unallocated */
            (*visited_l1_entries)++;
            if (status_cb) {
                status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
            }
            continue;
        }

        if (offset_into_cluster(s, l2_offset)) {
            qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                    PRIx64 " unaligned (L1 index: %#x)",
                                    l2_offset, i);
            ret = -EIO;
            goto fail;
        }

        ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                 &l2_refcount);
        if (ret < 0) {
            goto fail;
        }

        for (slice = 0; slice < n_slices; slice++) {
            uint64_t slice_offset = l2_offset + slice * slice_size2;
            bool l2_dirty = false;
            if (is_active_l1) {
                /* get active L2 tables from cache */
                ret = qcow2_cache_get(bs, s->l2_table_cache, slice_offset,
                                      (void **)&l2_slice);
            } else {
                /* load inactive L2 tables from disk */
                ret = bdrv_pread(bs->file, slice_offset, slice_size2,
                                 l2_slice, 0);
            }
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_slice_size; j++) {
                uint64_t l2_entry = get_l2_entry(s, l2_slice, j);
                int64_t offset = l2_entry & L2E_OFFSET_MASK;
                QCow2ClusterType cluster_type =
                    qcow2_get_cluster_type(bs, l2_entry);

                if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
                    cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
                    continue;
                }

                if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                    if (!bs->backing) {
                        /*
                         * not backed; therefore we can simply deallocate the
                         * cluster. No need to call set_l2_bitmap(), this
                         * function doesn't support images with subclusters.
                         */
                        set_l2_entry(s, l2_slice, j, 0);
                        l2_dirty = true;
                        continue;
                    }

                    offset = qcow2_alloc_clusters(bs, s->cluster_size);
                    if (offset < 0) {
                        ret = offset;
                        goto fail;
                    }

                    /* The offset must fit in the offset field */
                    assert((offset & L2E_OFFSET_MASK) == offset);

                    if (l2_refcount > 1) {
                        /* For shared L2 tables, set the refcount accordingly
                         * (it is already 1 and needs to be l2_refcount) */
                        ret = qcow2_update_cluster_refcount(
                            bs, offset >> s->cluster_bits,
                            refcount_diff(1, l2_refcount), false,
                            QCOW2_DISCARD_OTHER);
                        if (ret < 0) {
                            qcow2_free_clusters(bs, offset, s->cluster_size,
                                                QCOW2_DISCARD_OTHER);
                            goto fail;
                        }
                    }
                }

                if (offset_into_cluster(s, offset)) {
                    int l2_index = slice * s->l2_slice_size + j;
                    qcow2_signal_corruption(
                        bs, true, -1, -1,
                        "Cluster allocation offset "
                        "%#" PRIx64 " unaligned (L2 offset: %#"
                        PRIx64 ", L2 index: %#x)", offset,
                        l2_offset, l2_index);
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    ret = -EIO;
                    goto fail;
                }

                ret = qcow2_pre_write_overlap_check(bs, 0, offset,
                                                    s->cluster_size, true);
                if (ret < 0) {
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    goto fail;
                }

                ret = bdrv_pwrite_zeroes(s->data_file, offset,
                                         s->cluster_size, 0);
                if (ret < 0) {
                    if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
                        qcow2_free_clusters(bs, offset, s->cluster_size,
                                            QCOW2_DISCARD_ALWAYS);
                    }
                    goto fail;
                }

                if (l2_refcount == 1) {
                    set_l2_entry(s, l2_slice, j, offset | QCOW_OFLAG_COPIED);
                } else {
                    set_l2_entry(s, l2_slice, j, offset);
                }
                /*
                 * No need to call set_l2_bitmap() after set_l2_entry() because
                 * this function doesn't support images with subclusters.
                 */
                l2_dirty = true;
            }

            if (is_active_l1) {
                if (l2_dirty) {
                    qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice);
                    qcow2_cache_depends_on_flush(s->l2_table_cache);
                }
                qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
            } else {
                if (l2_dirty) {
                    ret = qcow2_pre_write_overlap_check(
                        bs, QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2,
                        slice_offset, slice_size2, false);
                    if (ret < 0) {
                        goto fail;
                    }

                    ret = bdrv_pwrite(bs->file, slice_offset, slice_size2,
                                      l2_slice, 0);
                    if (ret < 0) {
                        goto fail;
                    }
                }
            }
        }

        (*visited_l1_entries)++;
        if (status_cb) {
            status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque);
        }
    }

    ret = 0;

fail:
    if (l2_slice) {
        if (!is_active_l1) {
            qemu_vfree(l2_slice);
        } else {
            qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
        }
    }
    return ret;
}

/*
 * For backed images, expands all zero clusters on the image. For non-backed
 * images, deallocates all non-pre-allocated zero clusters (and claims the
 * allocation for pre-allocated ones). This is important for downgrading to a
 * qcow2 version which doesn't yet support metadata zero clusters.
 */
int qcow2_expand_zero_clusters(BlockDriverState *bs,
                               BlockDriverAmendStatusCB *status_cb,
                               void *cb_opaque)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table = NULL;
    int64_t l1_entries = 0, visited_l1_entries = 0;
    int ret;
    int i, j;

    if (status_cb) {
        l1_entries = s->l1_size;
        for (i = 0; i < s->nb_snapshots; i++) {
            l1_entries += s->snapshots[i].l1_size;
        }
    }

    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
                                     &visited_l1_entries, l1_entries,
                                     status_cb, cb_opaque);
    if (ret < 0) {
        goto fail;
    }

    /* Inactive L1 tables may point to active L2 tables - therefore it is
     * necessary to flush the L2 table cache before trying to access the L2
     * tables pointed to by inactive L1 entries (else we might try to expand
     * zero clusters that have already been expanded); furthermore, it is also
     * necessary to empty the L2 table cache, since it may contain tables which
     * are now going to be modified directly on disk, bypassing the cache.
     * qcow2_cache_empty() does both for us. */
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
    if (ret < 0) {
        goto fail;
    }

    for (i = 0; i < s->nb_snapshots; i++) {
        int l1_size2;
        uint64_t *new_l1_table;
        Error *local_err = NULL;

        ret = qcow2_validate_table(bs, s->snapshots[i].l1_table_offset,
                                   s->snapshots[i].l1_size, L1E_SIZE,
                                   QCOW_MAX_L1_SIZE, "Snapshot L1 table",
                                   &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            goto fail;
        }

        l1_size2 = s->snapshots[i].l1_size * L1E_SIZE;
        new_l1_table = g_try_realloc(l1_table, l1_size2);

        if (!new_l1_table) {
            ret = -ENOMEM;
            goto fail;
        }

        l1_table = new_l1_table;

        ret = bdrv_pread(bs->file, s->snapshots[i].l1_table_offset, l1_size2,
                         l1_table, 0);
        if (ret < 0) {
            goto fail;
        }

        for (j = 0; j < s->snapshots[i].l1_size; j++) {
            be64_to_cpus(&l1_table[j]);
        }

        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
                                         &visited_l1_entries, l1_entries,
                                         status_cb, cb_opaque);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = 0;

fail:
    g_free(l1_table);
    return ret;
}

void qcow2_parse_compressed_l2_entry(BlockDriverState *bs, uint64_t l2_entry,
                                     uint64_t *coffset, int *csize)
{
    BDRVQcow2State *s = bs->opaque;
    int nb_csectors;

    assert(qcow2_get_cluster_type(bs, l2_entry) == QCOW2_CLUSTER_COMPRESSED);

    *coffset = l2_entry & s->cluster_offset_mask;

    nb_csectors = ((l2_entry >> s->csize_shift) & s->csize_mask) + 1;
    *csize = nb_csectors * QCOW2_COMPRESSED_SECTOR_SIZE -
        (*coffset & (QCOW2_COMPRESSED_SECTOR_SIZE - 1));
}
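
/*
 * Worked example (illustrative; the derived values assume 64 KiB clusters,
 * i.e. cluster_bits = 16, so csize_shift = 62 - (16 - 8) = 54 and
 * csize_mask = 0xff): for an L2 entry whose sector-count field holds 5 and
 * whose offset field points 100 bytes into a 512-byte sector, nb_csectors is
 * 6 and csize = 6 * 512 - 100 = 2972, i.e. the compressed data may start
 * mid-sector and the returned size covers it up to the end of its last
 * 512-byte sector.
 */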