1
/*
2
 * QEMU Enhanced Disk Format
3
 *
4
 * Copyright IBM, Corp. 2010
5
 *
6
 * Authors:
7
 *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
8
 *  Anthony Liguori   <aliguori@us.ibm.com>
9
 *
10
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
11
 * See the COPYING.LIB file in the top-level directory.
12
 *
13
 */
14

15
#include "qemu/osdep.h"
16
#include "block/qdict.h"
17
#include "qapi/error.h"
18
#include "qemu/timer.h"
19
#include "qemu/bswap.h"
20
#include "qemu/main-loop.h"
21
#include "qemu/module.h"
22
#include "qemu/option.h"
23
#include "qemu/memalign.h"
24
#include "trace.h"
25
#include "qed.h"
26
#include "sysemu/block-backend.h"
27
#include "qapi/qmp/qdict.h"
28
#include "qapi/qobject-input-visitor.h"
29
#include "qapi/qapi-visit-block-core.h"
30

31
static QemuOptsList qed_create_opts;
32

33
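/*
 * Format probing helper (supplementary note): the score of 100 tells the
 * generic block layer that the QED magic matched and this driver should be
 * chosen; 0 means the buffer is not a QED image.
 */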
static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
34
                          const char *filename)
35
{
36
    const QEDHeader *header = (const QEDHeader *)buf;
37

38
    if (buf_size < sizeof(*header)) {
39
        return 0;
40
    }
41
    if (le32_to_cpu(header->magic) != QED_MAGIC) {
42
        return 0;
43
    }
44
    return 100;
45
}
46

47
/**
48
 * Check whether an image format is raw
49
 *
50
 * @fmt:    Backing file format, may be NULL
51
 */
52
static bool qed_fmt_is_raw(const char *fmt)
53
{
54
    return fmt && strcmp(fmt, "raw") == 0;
55
}
56

57
static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu)
58
{
59
    cpu->magic = le32_to_cpu(le->magic);
60
    cpu->cluster_size = le32_to_cpu(le->cluster_size);
61
    cpu->table_size = le32_to_cpu(le->table_size);
62
    cpu->header_size = le32_to_cpu(le->header_size);
63
    cpu->features = le64_to_cpu(le->features);
64
    cpu->compat_features = le64_to_cpu(le->compat_features);
65
    cpu->autoclear_features = le64_to_cpu(le->autoclear_features);
66
    cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset);
67
    cpu->image_size = le64_to_cpu(le->image_size);
68
    cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset);
69
    cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size);
70
}
71

72
static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le)
73
{
74
    le->magic = cpu_to_le32(cpu->magic);
75
    le->cluster_size = cpu_to_le32(cpu->cluster_size);
76
    le->table_size = cpu_to_le32(cpu->table_size);
77
    le->header_size = cpu_to_le32(cpu->header_size);
78
    le->features = cpu_to_le64(cpu->features);
79
    le->compat_features = cpu_to_le64(cpu->compat_features);
80
    le->autoclear_features = cpu_to_le64(cpu->autoclear_features);
81
    le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset);
82
    le->image_size = cpu_to_le64(cpu->image_size);
83
    le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset);
84
    le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size);
85
}
86

87
int qed_write_header_sync(BDRVQEDState *s)
88
{
89
    QEDHeader le;
90

91
    qed_header_cpu_to_le(&s->header, &le);
92
    return bdrv_pwrite(s->bs->file, 0, sizeof(le), &le, 0);
93
}
94

95
/**
96
 * Update header in-place (does not rewrite backing filename or other strings)
97
 *
98
 * This function only updates known header fields in-place and does not affect
99
 * extra data after the QED header.
100
 *
101
 * No new allocating reqs can start while this function runs.
102
 */
103
static int coroutine_fn GRAPH_RDLOCK qed_write_header(BDRVQEDState *s)
104
{
105
    /* We must write full sectors for O_DIRECT but cannot necessarily generate
106
     * the data following the header if an unrecognized compat feature is
107
     * active.  Therefore, first read the sectors containing the header, update
108
     * them, and write back.
109
     */
110

111
    int nsectors = DIV_ROUND_UP(sizeof(QEDHeader), BDRV_SECTOR_SIZE);
112
    size_t len = nsectors * BDRV_SECTOR_SIZE;
113
    uint8_t *buf;
114
    int ret;
115

116
    assert(s->allocating_acb || s->allocating_write_reqs_plugged);
117

118
    buf = qemu_blockalign(s->bs, len);
119

120
    ret = bdrv_co_pread(s->bs->file, 0, len, buf, 0);
121
    if (ret < 0) {
122
        goto out;
123
    }
124

125
    /* Update header */
126
    qed_header_cpu_to_le(&s->header, (QEDHeader *) buf);
127

128
    ret = bdrv_co_pwrite(s->bs->file, 0, len, buf, 0);
129
    if (ret < 0) {
130
        goto out;
131
    }
132

133
    ret = 0;
134
out:
135
    qemu_vfree(buf);
136
    return ret;
137
}
138

139
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
140
{
141
    uint64_t table_entries;
142
    uint64_t l2_size;
143

144
    table_entries = (table_size * cluster_size) / sizeof(uint64_t);
145
    l2_size = table_entries * cluster_size;
146

147
    return l2_size * table_entries;
148
}
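/*
 * Worked example (illustrative, assuming the QED_DEFAULT_CLUSTER_SIZE of
 * 64 KiB and QED_DEFAULT_TABLE_SIZE of 4 from qed.h): each table holds
 * (4 * 65536) / 8 = 32768 entries, one L2 table therefore maps
 * 32768 * 64 KiB = 2 GiB, and an L1 table referencing 32768 L2 tables
 * yields a maximum image size of 64 TiB.
 */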
149

150
static bool qed_is_cluster_size_valid(uint32_t cluster_size)
151
{
152
    if (cluster_size < QED_MIN_CLUSTER_SIZE ||
153
        cluster_size > QED_MAX_CLUSTER_SIZE) {
154
        return false;
155
    }
156
    if (cluster_size & (cluster_size - 1)) {
157
        return false; /* not power of 2 */
158
    }
159
    return true;
160
}
161

162
static bool qed_is_table_size_valid(uint32_t table_size)
163
{
164
    if (table_size < QED_MIN_TABLE_SIZE ||
165
        table_size > QED_MAX_TABLE_SIZE) {
166
        return false;
167
    }
168
    if (table_size & (table_size - 1)) {
169
        return false; /* not power of 2 */
170
    }
171
    return true;
172
}
173

174
static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size,
175
                                    uint32_t table_size)
176
{
177
    if (image_size % BDRV_SECTOR_SIZE != 0) {
178
        return false; /* not multiple of sector size */
179
    }
180
    if (image_size > qed_max_image_size(cluster_size, table_size)) {
181
        return false; /* image is too large */
182
    }
183
    return true;
184
}
185

186
/**
187
 * Read a string of known length from the image file
188
 *
189
 * @file:       Image file
190
 * @offset:     File offset to start of string, in bytes
191
 * @n:          String length in bytes
192
 * @buf:        Destination buffer
193
 * @buflen:     Destination buffer length in bytes
194
 * @ret:        0 on success, -errno on failure
195
 *
196
 * The string is NUL-terminated.
197
 */
198
static int coroutine_fn GRAPH_RDLOCK
199
qed_read_string(BdrvChild *file, uint64_t offset,
200
                size_t n, char *buf, size_t buflen)
201
{
202
    int ret;
203
    if (n >= buflen) {
204
        return -EINVAL;
205
    }
206
    ret = bdrv_co_pread(file, offset, n, buf, 0);
207
    if (ret < 0) {
208
        return ret;
209
    }
210
    buf[n] = '\0';
211
    return 0;
212
}
213

214
/**
215
 * Allocate new clusters
216
 *
217
 * @s:          QED state
218
 * @n:          Number of contiguous clusters to allocate
219
 * @ret:        Offset of first allocated cluster
220
 *
221
 * This function only produces the offset where the new clusters should be
222
 * written.  It updates BDRVQEDState but does not make any changes to the image
223
 * file.
224
 *
225
 * Called with table_lock held.
226
 */
227
static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n)
228
{
229
    uint64_t offset = s->file_size;
230
    s->file_size += n * s->header.cluster_size;
231
    return offset;
232
}
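/*
 * Note (supplementary): allocation in QED is append-only.  There is no free
 * list; the current file size is the allocation high-water mark, which is
 * why bdrv_qed_do_open() rounds the file size down to a cluster boundary
 * and why bdrv_qed_co_create() truncates a new image to length 0.
 */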
233

234
QEDTable *qed_alloc_table(BDRVQEDState *s)
235
{
236
    /* Honor O_DIRECT memory alignment requirements */
237
    return qemu_blockalign(s->bs,
238
                           s->header.cluster_size * s->header.table_size);
239
}
240

241
/**
242
 * Allocate a new zeroed L2 table
243
 *
244
 * Called with table_lock held.
245
 */
246
static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
247
{
248
    CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache);
249

250
    l2_table->table = qed_alloc_table(s);
251
    l2_table->offset = qed_alloc_clusters(s, s->header.table_size);
252

253
    memset(l2_table->table->offsets, 0,
254
           s->header.cluster_size * s->header.table_size);
255
    return l2_table;
256
}
257

258
static bool coroutine_fn qed_plug_allocating_write_reqs(BDRVQEDState *s)
259
{
260
    qemu_co_mutex_lock(&s->table_lock);
261

262
    /* No reentrancy is allowed.  */
263
    assert(!s->allocating_write_reqs_plugged);
264
    if (s->allocating_acb != NULL) {
265
        /* Another allocating write came concurrently.  This cannot happen
266
         * from bdrv_qed_drain_begin, but it can happen when the timer runs.
267
         */
268
        qemu_co_mutex_unlock(&s->table_lock);
269
        return false;
270
    }
271

272
    s->allocating_write_reqs_plugged = true;
273
    qemu_co_mutex_unlock(&s->table_lock);
274
    return true;
275
}
276

277
static void coroutine_fn qed_unplug_allocating_write_reqs(BDRVQEDState *s)
278
{
279
    qemu_co_mutex_lock(&s->table_lock);
280
    assert(s->allocating_write_reqs_plugged);
281
    s->allocating_write_reqs_plugged = false;
282
    qemu_co_queue_next(&s->allocating_write_reqs);
283
    qemu_co_mutex_unlock(&s->table_lock);
284
}
285

286
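/*
 * Overview of the "need check" mechanism (summary of the code in this file):
 * QED_F_NEED_CHECK is set in the header before the first allocating write
 * (see qed_aio_write_alloc()).  It is cleared either here, once no
 * allocating write has happened for QED_NEED_CHECK_TIMEOUT seconds, or on
 * clean close in bdrv_qed_do_close().  If an image is opened with the flag
 * still set, bdrv_qed_do_open() runs qed_check() to repair the effects of
 * an unclean shutdown.
 */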
static void coroutine_fn GRAPH_RDLOCK qed_need_check_timer(BDRVQEDState *s)
287
{
288
    int ret;
289

290
    trace_qed_need_check_timer_cb(s);
291
    assert_bdrv_graph_readable();
292

293
    if (!qed_plug_allocating_write_reqs(s)) {
294
        return;
295
    }
296

297
    /* Ensure writes are on disk before clearing flag */
298
    ret = bdrv_co_flush(s->bs->file->bs);
299
    if (ret < 0) {
300
        qed_unplug_allocating_write_reqs(s);
301
        return;
302
    }
303

304
    s->header.features &= ~QED_F_NEED_CHECK;
305
    ret = qed_write_header(s);
306
    (void) ret;
307

308
    qed_unplug_allocating_write_reqs(s);
309

310
    ret = bdrv_co_flush(s->bs);
311
    (void) ret;
312
}
313

314
static void coroutine_fn qed_need_check_timer_entry(void *opaque)
315
{
316
    BDRVQEDState *s = opaque;
317
    GRAPH_RDLOCK_GUARD();
318

319
    qed_need_check_timer(opaque);
320
    bdrv_dec_in_flight(s->bs);
321
}
322

323
static void qed_need_check_timer_cb(void *opaque)
324
{
325
    BDRVQEDState *s = opaque;
326
    Coroutine *co = qemu_coroutine_create(qed_need_check_timer_entry, opaque);
327

328
    bdrv_inc_in_flight(s->bs);
329
    qemu_coroutine_enter(co);
330
}
331

332
static void qed_start_need_check_timer(BDRVQEDState *s)
333
{
334
    trace_qed_start_need_check_timer(s);
335

336
    /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for
337
     * migration.
338
     */
339
    timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
340
                   NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT);
341
}
342

343
/* It's okay to call this multiple times or when no timer is started */
344
static void qed_cancel_need_check_timer(BDRVQEDState *s)
345
{
346
    trace_qed_cancel_need_check_timer(s);
347
    timer_del(s->need_check_timer);
348
}
349

350
static void bdrv_qed_detach_aio_context(BlockDriverState *bs)
351
{
352
    BDRVQEDState *s = bs->opaque;
353

354
    qed_cancel_need_check_timer(s);
355
    timer_free(s->need_check_timer);
356
}
357

358
static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
359
                                        AioContext *new_context)
360
{
361
    BDRVQEDState *s = bs->opaque;
362

363
    s->need_check_timer = aio_timer_new(new_context,
364
                                        QEMU_CLOCK_VIRTUAL, SCALE_NS,
365
                                        qed_need_check_timer_cb, s);
366
    if (s->header.features & QED_F_NEED_CHECK) {
367
        qed_start_need_check_timer(s);
368
    }
369
}
370

371
static void bdrv_qed_drain_begin(BlockDriverState *bs)
372
{
373
    BDRVQEDState *s = bs->opaque;
374

375
    /* Fire the timer immediately in order to start doing I/O as soon as the
376
     * header is flushed.
377
     */
378
    if (s->need_check_timer && timer_pending(s->need_check_timer)) {
379
        Coroutine *co;
380

381
        qed_cancel_need_check_timer(s);
382
        co = qemu_coroutine_create(qed_need_check_timer_entry, s);
383
        bdrv_inc_in_flight(bs);
384
        aio_co_enter(bdrv_get_aio_context(bs), co);
385
    }
386
}
387

388
static void bdrv_qed_init_state(BlockDriverState *bs)
389
{
390
    BDRVQEDState *s = bs->opaque;
391

392
    memset(s, 0, sizeof(BDRVQEDState));
393
    s->bs = bs;
394
    qemu_co_mutex_init(&s->table_lock);
395
    qemu_co_queue_init(&s->allocating_write_reqs);
396
}
397

398
/* Called with table_lock held.  */
399
static int coroutine_fn GRAPH_RDLOCK
400
bdrv_qed_do_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
401
{
402
    BDRVQEDState *s = bs->opaque;
403
    QEDHeader le_header;
404
    int64_t file_size;
405
    int ret;
406

407
    ret = bdrv_co_pread(bs->file, 0, sizeof(le_header), &le_header, 0);
408
    if (ret < 0) {
409
        error_setg(errp, "Failed to read QED header");
410
        return ret;
411
    }
412
    qed_header_le_to_cpu(&le_header, &s->header);
413

414
    if (s->header.magic != QED_MAGIC) {
415
        error_setg(errp, "Image not in QED format");
416
        return -EINVAL;
417
    }
418
    if (s->header.features & ~QED_FEATURE_MASK) {
419
        /* image uses unsupported feature bits */
420
        error_setg(errp, "Unsupported QED features: %" PRIx64,
421
                   s->header.features & ~QED_FEATURE_MASK);
422
        return -ENOTSUP;
423
    }
424
    if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
425
        error_setg(errp, "QED cluster size is invalid");
426
        return -EINVAL;
427
    }
428

429
    /* Round down file size to the last cluster */
430
    file_size = bdrv_co_getlength(bs->file->bs);
431
    if (file_size < 0) {
432
        error_setg(errp, "Failed to get file length");
433
        return file_size;
434
    }
435
    s->file_size = qed_start_of_cluster(s, file_size);
436

437
    if (!qed_is_table_size_valid(s->header.table_size)) {
438
        error_setg(errp, "QED table size is invalid");
439
        return -EINVAL;
440
    }
441
    if (!qed_is_image_size_valid(s->header.image_size,
442
                                 s->header.cluster_size,
443
                                 s->header.table_size)) {
444
        error_setg(errp, "QED image size is invalid");
445
        return -EINVAL;
446
    }
447
    if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
448
        error_setg(errp, "QED table offset is invalid");
449
        return -EINVAL;
450
    }
451

452
    s->table_nelems = (s->header.cluster_size * s->header.table_size) /
453
                      sizeof(uint64_t);
454
    s->l2_shift = ctz32(s->header.cluster_size);
455
    s->l2_mask = s->table_nelems - 1;
456
    s->l1_shift = s->l2_shift + ctz32(s->table_nelems);
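    /*
     * Illustration (assuming the defaults from qed.h, 64 KiB clusters and
     * table_size 4): table_nelems = 32768, so l2_shift = 16,
     * l2_mask = 0x7fff and l1_shift = 31.  A guest offset then splits into
     * l1_index = pos >> 31, l2_index = (pos >> 16) & 0x7fff, and the low
     * 16 bits as the offset within the cluster.
     */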
457

458
    /* Header size calculation must not overflow uint32_t */
459
    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
460
        error_setg(errp, "QED header size is too large");
461
        return -EINVAL;
462
    }
463

464
    if ((s->header.features & QED_F_BACKING_FILE)) {
465
        g_autofree char *backing_file_str = NULL;
466

467
        if ((uint64_t)s->header.backing_filename_offset +
468
            s->header.backing_filename_size >
469
            s->header.cluster_size * s->header.header_size) {
470
            error_setg(errp, "QED backing filename offset is invalid");
471
            return -EINVAL;
472
        }
473

474
        backing_file_str = g_malloc(sizeof(bs->backing_file));
475
        ret = qed_read_string(bs->file, s->header.backing_filename_offset,
476
                              s->header.backing_filename_size,
477
                              backing_file_str, sizeof(bs->backing_file));
478
        if (ret < 0) {
479
            error_setg(errp, "Failed to read backing filename");
480
            return ret;
481
        }
482

483
        if (!g_str_equal(backing_file_str, bs->backing_file)) {
484
            pstrcpy(bs->backing_file, sizeof(bs->backing_file),
485
                    backing_file_str);
486
            pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
487
                    backing_file_str);
488
        }
489

490
        if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) {
491
            pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw");
492
        }
493
    }
494

495
    /* Reset unknown autoclear feature bits.  This is a backwards
496
     * compatibility mechanism that allows images to be opened by older
497
     * programs, which "knock out" unknown feature bits.  When an image is
498
     * opened by a newer program again it can detect that the autoclear
499
     * feature is no longer valid.
500
     */
501
    if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
502
        !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) {
503
        s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
504

505
        ret = qed_write_header_sync(s);
506
        if (ret) {
507
            error_setg(errp, "Failed to update header");
508
            return ret;
509
        }
510

511
        /* From here on only known autoclear feature bits are valid */
512
        bdrv_co_flush(bs->file->bs);
513
    }
514

515
    s->l1_table = qed_alloc_table(s);
516
    qed_init_l2_cache(&s->l2_cache);
517

518
    ret = qed_read_l1_table_sync(s);
519
    if (ret) {
520
        error_setg(errp, "Failed to read L1 table");
521
        goto out;
522
    }
523

524
    /* If image was not closed cleanly, check consistency */
525
    if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) {
526
        /* Read-only images cannot be fixed.  There is no risk of corruption
527
         * since write operations are not possible.  Therefore, allow
528
         * potentially inconsistent images to be opened read-only.  This can
529
         * aid data recovery from an otherwise inconsistent image.
530
         */
531
        if (!bdrv_is_read_only(bs->file->bs) &&
532
            !(flags & BDRV_O_INACTIVE)) {
533
            BdrvCheckResult result = {0};
534

535
            ret = qed_check(s, &result, true);
536
            if (ret) {
537
                error_setg(errp, "Image corrupted");
538
                goto out;
539
            }
540
        }
541
    }
542

543
    bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs));
544

545
out:
546
    if (ret) {
547
        qed_free_l2_cache(&s->l2_cache);
548
        qemu_vfree(s->l1_table);
549
    }
550
    return ret;
551
}
552

553
typedef struct QEDOpenCo {
554
    BlockDriverState *bs;
555
    QDict *options;
556
    int flags;
557
    Error **errp;
558
    int ret;
559
} QEDOpenCo;
560

561
static void coroutine_fn bdrv_qed_open_entry(void *opaque)
562
{
563
    QEDOpenCo *qoc = opaque;
564
    BDRVQEDState *s = qoc->bs->opaque;
565

566
    GRAPH_RDLOCK_GUARD();
567

568
    qemu_co_mutex_lock(&s->table_lock);
569
    qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
570
    qemu_co_mutex_unlock(&s->table_lock);
571
}
572

573
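/*
 * bdrv_qed_do_open() uses coroutine-only helpers, so the synchronous open
 * callback trampolines into a coroutine via QEDOpenCo and polls until
 * qoc.ret changes from the -EINPROGRESS sentinel.
 */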
static int coroutine_mixed_fn bdrv_qed_open(BlockDriverState *bs, QDict *options,
574
                                            int flags, Error **errp)
575
{
576
    QEDOpenCo qoc = {
577
        .bs = bs,
578
        .options = options,
579
        .flags = flags,
580
        .errp = errp,
581
        .ret = -EINPROGRESS
582
    };
583
    int ret;
584

585
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
586
    if (ret < 0) {
587
        return ret;
588
    }
589

590
    bdrv_qed_init_state(bs);
591
    assert(!qemu_in_coroutine());
592
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
593
    qemu_coroutine_enter(qemu_coroutine_create(bdrv_qed_open_entry, &qoc));
594
    BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
595

596
    return qoc.ret;
597
}
598

599
static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp)
600
{
601
    BDRVQEDState *s = bs->opaque;
602

603
    bs->bl.pwrite_zeroes_alignment = s->header.cluster_size;
604
    bs->bl.max_pwrite_zeroes = QEMU_ALIGN_DOWN(INT_MAX, s->header.cluster_size);
605
}
606

607
/* We have nothing to do for QED reopen, stubs just return
608
 * success */
609
static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
610
                                   BlockReopenQueue *queue, Error **errp)
611
{
612
    return 0;
613
}
614

615
static void GRAPH_RDLOCK bdrv_qed_do_close(BlockDriverState *bs)
616
{
617
    BDRVQEDState *s = bs->opaque;
618

619
    bdrv_qed_detach_aio_context(bs);
620

621
    /* Ensure writes reach stable storage */
622
    bdrv_flush(bs->file->bs);
623

624
    /* Clean shutdown, no check required on next open */
625
    if (s->header.features & QED_F_NEED_CHECK) {
626
        s->header.features &= ~QED_F_NEED_CHECK;
627
        qed_write_header_sync(s);
628
    }
629

630
    qed_free_l2_cache(&s->l2_cache);
631
    qemu_vfree(s->l1_table);
632
}
633

634
static void GRAPH_UNLOCKED bdrv_qed_close(BlockDriverState *bs)
635
{
636
    GLOBAL_STATE_CODE();
637
    GRAPH_RDLOCK_GUARD_MAINLOOP();
638

639
    bdrv_qed_do_close(bs);
640
}
641

642
static int coroutine_fn GRAPH_UNLOCKED
643
bdrv_qed_co_create(BlockdevCreateOptions *opts, Error **errp)
644
{
645
    BlockdevCreateOptionsQed *qed_opts;
646
    BlockBackend *blk = NULL;
647
    BlockDriverState *bs = NULL;
648

649
    QEDHeader header;
650
    QEDHeader le_header;
651
    uint8_t *l1_table = NULL;
652
    size_t l1_size;
653
    int ret = 0;
654

655
    assert(opts->driver == BLOCKDEV_DRIVER_QED);
656
    qed_opts = &opts->u.qed;
657

658
    /* Validate options and set default values */
659
    if (!qed_opts->has_cluster_size) {
660
        qed_opts->cluster_size = QED_DEFAULT_CLUSTER_SIZE;
661
    }
662
    if (!qed_opts->has_table_size) {
663
        qed_opts->table_size = QED_DEFAULT_TABLE_SIZE;
664
    }
665

666
    if (!qed_is_cluster_size_valid(qed_opts->cluster_size)) {
667
        error_setg(errp, "QED cluster size must be within range [%u, %u] "
668
                         "and power of 2",
669
                   QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE);
670
        return -EINVAL;
671
    }
672
    if (!qed_is_table_size_valid(qed_opts->table_size)) {
673
        error_setg(errp, "QED table size must be within range [%u, %u] "
674
                         "and power of 2",
675
                   QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE);
676
        return -EINVAL;
677
    }
678
    if (!qed_is_image_size_valid(qed_opts->size, qed_opts->cluster_size,
679
                                 qed_opts->table_size))
680
    {
681
        error_setg(errp, "QED image size must be a non-zero multiple of "
682
                         "cluster size and less than %" PRIu64 " bytes",
683
                   qed_max_image_size(qed_opts->cluster_size,
684
                                      qed_opts->table_size));
685
        return -EINVAL;
686
    }
687

688
    /* Create BlockBackend to write to the image */
689
    bs = bdrv_co_open_blockdev_ref(qed_opts->file, errp);
690
    if (bs == NULL) {
691
        return -EIO;
692
    }
693

694
    blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
695
                             errp);
696
    if (!blk) {
697
        ret = -EPERM;
698
        goto out;
699
    }
700
    blk_set_allow_write_beyond_eof(blk, true);
701

702
    /* Prepare image format */
703
    header = (QEDHeader) {
704
        .magic = QED_MAGIC,
705
        .cluster_size = qed_opts->cluster_size,
706
        .table_size = qed_opts->table_size,
707
        .header_size = 1,
708
        .features = 0,
709
        .compat_features = 0,
710
        .l1_table_offset = qed_opts->cluster_size,
711
        .image_size = qed_opts->size,
712
    };
713

714
    l1_size = header.cluster_size * header.table_size;
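    /*
     * Resulting on-disk layout (sketch): cluster 0 holds the header
     * (header_size = 1), with the optional backing filename stored right
     * after the fixed header fields; the L1 table starts at l1_table_offset
     * (one cluster into the file) and spans table_size clusters; L2 tables
     * and data clusters are appended as the image is written.
     */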
715

716
    /*
717
     * The QED format associates file length with allocation status,
718
     * so a new file (which is empty) must have a length of 0.
719
     */
720
    ret = blk_co_truncate(blk, 0, true, PREALLOC_MODE_OFF, 0, errp);
721
    if (ret < 0) {
722
        goto out;
723
    }
724

725
    if (qed_opts->backing_file) {
726
        header.features |= QED_F_BACKING_FILE;
727
        header.backing_filename_offset = sizeof(le_header);
728
        header.backing_filename_size = strlen(qed_opts->backing_file);
729

730
        if (qed_opts->has_backing_fmt) {
731
            const char *backing_fmt = BlockdevDriver_str(qed_opts->backing_fmt);
732
            if (qed_fmt_is_raw(backing_fmt)) {
733
                header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
734
            }
735
        }
736
    }
737

738
    qed_header_cpu_to_le(&header, &le_header);
739
    ret = blk_co_pwrite(blk, 0, sizeof(le_header), &le_header, 0);
740
    if (ret < 0) {
741
        goto out;
742
    }
743
    ret = blk_co_pwrite(blk, sizeof(le_header), header.backing_filename_size,
744
                     qed_opts->backing_file, 0);
745
    if (ret < 0) {
746
        goto out;
747
    }
748

749
    l1_table = g_malloc0(l1_size);
750
    ret = blk_co_pwrite(blk, header.l1_table_offset, l1_size, l1_table, 0);
751
    if (ret < 0) {
752
        goto out;
753
    }
754

755
    ret = 0; /* success */
756
out:
757
    g_free(l1_table);
758
    blk_co_unref(blk);
759
    bdrv_co_unref(bs);
760
    return ret;
761
}
762

763
static int coroutine_fn GRAPH_UNLOCKED
764
bdrv_qed_co_create_opts(BlockDriver *drv, const char *filename,
765
                        QemuOpts *opts, Error **errp)
766
{
767
    BlockdevCreateOptions *create_options = NULL;
768
    QDict *qdict;
769
    Visitor *v;
770
    BlockDriverState *bs = NULL;
771
    int ret;
772

773
    static const QDictRenames opt_renames[] = {
774
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
775
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
776
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
777
        { BLOCK_OPT_TABLE_SIZE,         "table-size" },
778
        { NULL, NULL },
779
    };
780

781
    /* Parse options and convert legacy syntax */
782
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, &qed_create_opts, true);
783

784
    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
785
        ret = -EINVAL;
786
        goto fail;
787
    }
788

789
    /* Create and open the file (protocol layer) */
790
    ret = bdrv_co_create_file(filename, opts, errp);
791
    if (ret < 0) {
792
        goto fail;
793
    }
794

795
    bs = bdrv_co_open(filename, NULL, NULL,
796
                      BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
797
    if (bs == NULL) {
798
        ret = -EIO;
799
        goto fail;
800
    }
801

802
    /* Now get the QAPI type BlockdevCreateOptions */
803
    qdict_put_str(qdict, "driver", "qed");
804
    qdict_put_str(qdict, "file", bs->node_name);
805

806
    v = qobject_input_visitor_new_flat_confused(qdict, errp);
807
    if (!v) {
808
        ret = -EINVAL;
809
        goto fail;
810
    }
811

812
    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
813
    visit_free(v);
814
    if (!create_options) {
815
        ret = -EINVAL;
816
        goto fail;
817
    }
818

819
    /* Silently round up size */
820
    assert(create_options->driver == BLOCKDEV_DRIVER_QED);
821
    create_options->u.qed.size =
822
        ROUND_UP(create_options->u.qed.size, BDRV_SECTOR_SIZE);
823

824
    /* Create the qed image (format layer) */
825
    ret = bdrv_qed_co_create(create_options, errp);
826

827
fail:
828
    qobject_unref(qdict);
829
    bdrv_co_unref(bs);
830
    qapi_free_BlockdevCreateOptions(create_options);
831
    return ret;
832
}
833

834
static int coroutine_fn GRAPH_RDLOCK
835
bdrv_qed_co_block_status(BlockDriverState *bs, bool want_zero, int64_t pos,
836
                         int64_t bytes, int64_t *pnum, int64_t *map,
837
                         BlockDriverState **file)
838
{
839
    BDRVQEDState *s = bs->opaque;
840
    size_t len = MIN(bytes, SIZE_MAX);
841
    int status;
842
    QEDRequest request = { .l2_table = NULL };
843
    uint64_t offset;
844
    int ret;
845

846
    qemu_co_mutex_lock(&s->table_lock);
847
    ret = qed_find_cluster(s, &request, pos, &len, &offset);
848

849
    *pnum = len;
850
    switch (ret) {
851
    case QED_CLUSTER_FOUND:
852
        *map = offset | qed_offset_into_cluster(s, pos);
853
        status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
854
        *file = bs->file->bs;
855
        break;
856
    case QED_CLUSTER_ZERO:
857
        status = BDRV_BLOCK_ZERO;
858
        break;
859
    case QED_CLUSTER_L2:
860
    case QED_CLUSTER_L1:
861
        status = 0;
862
        break;
863
    default:
864
        assert(ret < 0);
865
        status = ret;
866
        break;
867
    }
868

869
    qed_unref_l2_cache_entry(request.l2_table);
870
    qemu_co_mutex_unlock(&s->table_lock);
871

872
    return status;
873
}
874

875
static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
876
{
877
    return acb->bs->opaque;
878
}
879

880
/**
881
 * Read from the backing file or zero-fill if no backing file
882
 *
883
 * @s:              QED state
884
 * @pos:            Byte position in device
885
 * @qiov:           Destination I/O vector
886
 *
887
 * This function reads qiov->size bytes starting at pos from the backing file.
888
 * If there is no backing file then zeroes are read.
889
 */
890
static int coroutine_fn GRAPH_RDLOCK
891
qed_read_backing_file(BDRVQEDState *s, uint64_t pos, QEMUIOVector *qiov)
892
{
893
    if (s->bs->backing) {
894
        BLKDBG_CO_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO);
895
        return bdrv_co_preadv(s->bs->backing, pos, qiov->size, qiov, 0);
896
    }
897
    qemu_iovec_memset(qiov, 0, 0, qiov->size);
898
    return 0;
899
}
900

901
/**
902
 * Copy data from backing file into the image
903
 *
904
 * @s:          QED state
905
 * @pos:        Byte position in device
906
 * @len:        Number of bytes
907
 * @offset:     Byte offset in image file
908
 */
909
static int coroutine_fn GRAPH_RDLOCK
910
qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, uint64_t len,
911
                           uint64_t offset)
912
{
913
    QEMUIOVector qiov;
914
    int ret;
915

916
    /* Skip copy entirely if there is no work to do */
917
    if (len == 0) {
918
        return 0;
919
    }
920

921
    qemu_iovec_init_buf(&qiov, qemu_blockalign(s->bs, len), len);
922

923
    ret = qed_read_backing_file(s, pos, &qiov);
924

925
    if (ret) {
926
        goto out;
927
    }
928

929
    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_COW_WRITE);
930
    ret = bdrv_co_pwritev(s->bs->file, offset, qiov.size, &qiov, 0);
931
    if (ret < 0) {
932
        goto out;
933
    }
934
    ret = 0;
935
out:
936
    qemu_vfree(qemu_iovec_buf(&qiov));
937
    return ret;
938
}
939

940
/**
941
 * Link one or more contiguous clusters into a table
942
 *
943
 * @s:              QED state
944
 * @table:          L2 table
945
 * @index:          First cluster index
946
 * @n:              Number of contiguous clusters
947
 * @cluster:        First cluster offset
948
 *
949
 * The cluster offset may be an allocated byte offset in the image file, the
950
 * zero cluster marker, or the unallocated cluster marker.
951
 *
952
 * Called with table_lock held.
953
 */
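/*
 * Example (illustrative): linking three data clusters allocated at offset X
 * stores X, X + cluster_size and X + 2 * cluster_size in consecutive
 * entries, while the zero-cluster and unallocated markers are repeated
 * unchanged because they do not advance.
 */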
954
static void coroutine_fn qed_update_l2_table(BDRVQEDState *s, QEDTable *table,
955
                                             int index, unsigned int n,
956
                                             uint64_t cluster)
957
{
958
    int i;
959
    for (i = index; i < index + n; i++) {
960
        table->offsets[i] = cluster;
961
        if (!qed_offset_is_unalloc_cluster(cluster) &&
962
            !qed_offset_is_zero_cluster(cluster)) {
963
            cluster += s->header.cluster_size;
964
        }
965
    }
966
}
967

968
/* Called with table_lock held.  */
969
static void coroutine_fn qed_aio_complete(QEDAIOCB *acb)
970
{
971
    BDRVQEDState *s = acb_to_s(acb);
972

973
    /* Free resources */
974
    qemu_iovec_destroy(&acb->cur_qiov);
975
    qed_unref_l2_cache_entry(acb->request.l2_table);
976

977
    /* Free the buffer we may have allocated for zero writes */
978
    if (acb->flags & QED_AIOCB_ZERO) {
979
        qemu_vfree(acb->qiov->iov[0].iov_base);
980
        acb->qiov->iov[0].iov_base = NULL;
981
    }
982

983
    /* Start next allocating write request waiting behind this one.  Note that
984
     * requests enqueue themselves when they first hit an unallocated cluster
985
     * but they wait until the entire request is finished before waking up the
986
     * next request in the queue.  This ensures that we don't cycle through
987
     * requests multiple times but rather finish one at a time completely.
988
     */
989
    if (acb == s->allocating_acb) {
990
        s->allocating_acb = NULL;
991
        if (!qemu_co_queue_empty(&s->allocating_write_reqs)) {
992
            qemu_co_queue_next(&s->allocating_write_reqs);
993
        } else if (s->header.features & QED_F_NEED_CHECK) {
994
            qed_start_need_check_timer(s);
995
        }
996
    }
997
}
998

999
/**
1000
 * Update L1 table with new L2 table offset and write it out
1001
 *
1002
 * Called with table_lock held.
1003
 */
1004
static int coroutine_fn GRAPH_RDLOCK qed_aio_write_l1_update(QEDAIOCB *acb)
1005
{
1006
    BDRVQEDState *s = acb_to_s(acb);
1007
    CachedL2Table *l2_table = acb->request.l2_table;
1008
    uint64_t l2_offset = l2_table->offset;
1009
    int index, ret;
1010

1011
    index = qed_l1_index(s, acb->cur_pos);
1012
    s->l1_table->offsets[index] = l2_table->offset;
1013

1014
    ret = qed_write_l1_table(s, index, 1);
1015

1016
    /* Commit the current L2 table to the cache */
1017
    qed_commit_l2_cache_entry(&s->l2_cache, l2_table);
1018

1019
    /* This is guaranteed to succeed because we just committed the entry to the
1020
     * cache.
1021
     */
1022
    acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
1023
    assert(acb->request.l2_table != NULL);
1024

1025
    return ret;
1026
}
1027

1028

1029
/**
1030
 * Update L2 table with new cluster offsets and write them out
1031
 *
1032
 * Called with table_lock held.
1033
 */
1034
static int coroutine_fn GRAPH_RDLOCK
1035
qed_aio_write_l2_update(QEDAIOCB *acb, uint64_t offset)
1036
{
1037
    BDRVQEDState *s = acb_to_s(acb);
1038
    bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
1039
    int index, ret;
1040

1041
    if (need_alloc) {
1042
        qed_unref_l2_cache_entry(acb->request.l2_table);
1043
        acb->request.l2_table = qed_new_l2_table(s);
1044
    }
1045

1046
    index = qed_l2_index(s, acb->cur_pos);
1047
    qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
1048
                         offset);
1049

1050
    if (need_alloc) {
1051
        /* Write out the whole new L2 table */
1052
        ret = qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true);
1053
        if (ret) {
1054
            return ret;
1055
        }
1056
        return qed_aio_write_l1_update(acb);
1057
    } else {
1058
        /* Write out only the updated part of the L2 table */
1059
        ret = qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters,
1060
                                 false);
1061
        if (ret) {
1062
            return ret;
1063
        }
1064
    }
1065
    return 0;
1066
}
1067

1068
/**
1069
 * Write data to the image file
1070
 *
1071
 * Called with table_lock *not* held.
1072
 */
1073
static int coroutine_fn GRAPH_RDLOCK qed_aio_write_main(QEDAIOCB *acb)
1074
{
1075
    BDRVQEDState *s = acb_to_s(acb);
1076
    uint64_t offset = acb->cur_cluster +
1077
                      qed_offset_into_cluster(s, acb->cur_pos);
1078

1079
    trace_qed_aio_write_main(s, acb, 0, offset, acb->cur_qiov.size);
1080

1081
    BLKDBG_CO_EVENT(s->bs->file, BLKDBG_WRITE_AIO);
1082
    return bdrv_co_pwritev(s->bs->file, offset, acb->cur_qiov.size,
1083
                           &acb->cur_qiov, 0);
1084
}
1085

1086
/**
1087
 * Populate untouched regions of new data cluster
1088
 *
1089
 * Called with table_lock held.
1090
 */
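/*
 * Rough sketch of the regions handled below (illustrative):
 *
 *   |<-------------- newly allocated cluster(s) -------------->|
 *   |  prefill (backing)  |  guest write  |  postfill (backing) |
 *   ^ cluster start       ^ cur_pos       ^ cur_pos + size      ^ cluster end
 *
 * Both untouched regions are filled from the backing file (or zeroes) so
 * the new clusters are complete before the L2 table points at them.
 */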
1091
static int coroutine_fn GRAPH_RDLOCK qed_aio_write_cow(QEDAIOCB *acb)
1092
{
1093
    BDRVQEDState *s = acb_to_s(acb);
1094
    uint64_t start, len, offset;
1095
    int ret;
1096

1097
    qemu_co_mutex_unlock(&s->table_lock);
1098

1099
    /* Populate front untouched region of new data cluster */
1100
    start = qed_start_of_cluster(s, acb->cur_pos);
1101
    len = qed_offset_into_cluster(s, acb->cur_pos);
1102

1103
    trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster);
1104
    ret = qed_copy_from_backing_file(s, start, len, acb->cur_cluster);
1105
    if (ret < 0) {
1106
        goto out;
1107
    }
1108

1109
    /* Populate back untouched region of new data cluster */
1110
    start = acb->cur_pos + acb->cur_qiov.size;
1111
    len = qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start;
1112
    offset = acb->cur_cluster +
1113
             qed_offset_into_cluster(s, acb->cur_pos) +
1114
             acb->cur_qiov.size;
1115

1116
    trace_qed_aio_write_postfill(s, acb, start, len, offset);
1117
    ret = qed_copy_from_backing_file(s, start, len, offset);
1118
    if (ret < 0) {
1119
        goto out;
1120
    }
1121

1122
    ret = qed_aio_write_main(acb);
1123
    if (ret < 0) {
1124
        goto out;
1125
    }
1126

1127
    if (s->bs->backing) {
1128
        /*
1129
         * Flush new data clusters before updating the L2 table
1130
         *
1131
         * This flush is necessary when a backing file is in use.  A crash
1132
         * during an allocating write could result in empty clusters in the
1133
         * image.  If the write only touched a subregion of the cluster,
1134
         * then backing image sectors have been lost in the untouched
1135
         * region.  The solution is to flush after writing a new data
1136
         * cluster and before updating the L2 table.
1137
         */
1138
        ret = bdrv_co_flush(s->bs->file->bs);
1139
    }
1140

1141
out:
1142
    qemu_co_mutex_lock(&s->table_lock);
1143
    return ret;
1144
}
1145

1146
/**
1147
 * Check if the QED_F_NEED_CHECK bit should be set during allocating write
1148
 */
1149
static bool GRAPH_RDLOCK qed_should_set_need_check(BDRVQEDState *s)
1150
{
1151
    /* The flush before L2 update path ensures consistency */
1152
    if (s->bs->backing) {
1153
        return false;
1154
    }
1155

1156
    return !(s->header.features & QED_F_NEED_CHECK);
1157
}
1158

1159
/**
1160
 * Write new data cluster
1161
 *
1162
 * @acb:        Write request
1163
 * @len:        Length in bytes
1164
 *
1165
 * This path is taken when writing to previously unallocated clusters.
1166
 *
1167
 * Called with table_lock held.
1168
 */
1169
static int coroutine_fn GRAPH_RDLOCK
1170
qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
1171
{
1172
    BDRVQEDState *s = acb_to_s(acb);
1173
    int ret;
1174

1175
    /* Cancel timer when the first allocating request comes in */
1176
    if (s->allocating_acb == NULL) {
1177
        qed_cancel_need_check_timer(s);
1178
    }
1179

1180
    /* Freeze this request if another allocating write is in progress */
1181
    if (s->allocating_acb != acb || s->allocating_write_reqs_plugged) {
1182
        if (s->allocating_acb != NULL) {
1183
            qemu_co_queue_wait(&s->allocating_write_reqs, &s->table_lock);
1184
            assert(s->allocating_acb == NULL);
1185
        }
1186
        s->allocating_acb = acb;
1187
        return -EAGAIN; /* start over with looking up table entries */
1188
    }
1189

1190
    acb->cur_nclusters = qed_bytes_to_clusters(s,
1191
            qed_offset_into_cluster(s, acb->cur_pos) + len);
1192
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1193

1194
    if (acb->flags & QED_AIOCB_ZERO) {
1195
        /* Skip ahead if the clusters are already zero */
1196
        if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
1197
            return 0;
1198
        }
1199
        acb->cur_cluster = 1;
1200
    } else {
1201
        acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
1202
    }
1203

1204
    if (qed_should_set_need_check(s)) {
1205
        s->header.features |= QED_F_NEED_CHECK;
1206
        ret = qed_write_header(s);
1207
        if (ret < 0) {
1208
            return ret;
1209
        }
1210
    }
1211

1212
    if (!(acb->flags & QED_AIOCB_ZERO)) {
1213
        ret = qed_aio_write_cow(acb);
1214
        if (ret < 0) {
1215
            return ret;
1216
        }
1217
    }
1218

1219
    return qed_aio_write_l2_update(acb, acb->cur_cluster);
1220
}
1221

1222
/**
1223
 * Write data cluster in place
1224
 *
1225
 * @acb:        Write request
1226
 * @offset:     Cluster offset in bytes
1227
 * @len:        Length in bytes
1228
 *
1229
 * This path is taken when writing to already allocated clusters.
1230
 *
1231
 * Called with table_lock held.
1232
 */
1233
static int coroutine_fn GRAPH_RDLOCK
1234
qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
1235
{
1236
    BDRVQEDState *s = acb_to_s(acb);
1237
    int r;
1238

1239
    qemu_co_mutex_unlock(&s->table_lock);
1240

1241
    /* Allocate buffer for zero writes */
1242
    if (acb->flags & QED_AIOCB_ZERO) {
1243
        struct iovec *iov = acb->qiov->iov;
1244

1245
        if (!iov->iov_base) {
1246
            iov->iov_base = qemu_try_blockalign(acb->bs, iov->iov_len);
1247
            if (iov->iov_base == NULL) {
1248
                r = -ENOMEM;
1249
                goto out;
1250
            }
1251
            memset(iov->iov_base, 0, iov->iov_len);
1252
        }
1253
    }
1254

1255
    /* Calculate the I/O vector */
1256
    acb->cur_cluster = offset;
1257
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1258

1259
    /* Do the actual write.  */
1260
    r = qed_aio_write_main(acb);
1261
out:
1262
    qemu_co_mutex_lock(&s->table_lock);
1263
    return r;
1264
}
1265

1266
/**
1267
 * Write data cluster
1268
 *
1269
 * @opaque:     Write request
1270
 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1271
 * @offset:     Cluster offset in bytes
1272
 * @len:        Length in bytes
1273
 *
1274
 * Called with table_lock held.
1275
 */
1276
static int coroutine_fn GRAPH_RDLOCK
1277
qed_aio_write_data(void *opaque, int ret, uint64_t offset, size_t len)
1278
{
1279
    QEDAIOCB *acb = opaque;
1280

1281
    trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
1282

1283
    acb->find_cluster_ret = ret;
1284

1285
    switch (ret) {
1286
    case QED_CLUSTER_FOUND:
1287
        return qed_aio_write_inplace(acb, offset, len);
1288

1289
    case QED_CLUSTER_L2:
1290
    case QED_CLUSTER_L1:
1291
    case QED_CLUSTER_ZERO:
1292
        return qed_aio_write_alloc(acb, len);
1293

1294
    default:
1295
        g_assert_not_reached();
1296
    }
1297
}
1298

1299
/**
1300
 * Read data cluster
1301
 *
1302
 * @opaque:     Read request
1303
 * @ret:        QED_CLUSTER_FOUND, QED_CLUSTER_L2 or QED_CLUSTER_L1
1304
 * @offset:     Cluster offset in bytes
1305
 * @len:        Length in bytes
1306
 *
1307
 * Called with table_lock held.
1308
 */
1309
static int coroutine_fn GRAPH_RDLOCK
1310
qed_aio_read_data(void *opaque, int ret, uint64_t offset, size_t len)
1311
{
1312
    QEDAIOCB *acb = opaque;
1313
    BDRVQEDState *s = acb_to_s(acb);
1314
    BlockDriverState *bs = acb->bs;
1315
    int r;
1316

1317
    qemu_co_mutex_unlock(&s->table_lock);
1318

1319
    /* Adjust offset into cluster */
1320
    offset += qed_offset_into_cluster(s, acb->cur_pos);
1321

1322
    trace_qed_aio_read_data(s, acb, ret, offset, len);
1323

1324
    qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
1325

1326
    /* Handle zero cluster and backing file reads, otherwise read
1327
     * data cluster directly.
1328
     */
1329
    if (ret == QED_CLUSTER_ZERO) {
1330
        qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
1331
        r = 0;
1332
    } else if (ret != QED_CLUSTER_FOUND) {
1333
        r = qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov);
1334
    } else {
1335
        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
1336
        r = bdrv_co_preadv(bs->file, offset, acb->cur_qiov.size,
1337
                           &acb->cur_qiov, 0);
1338
    }
1339

1340
    qemu_co_mutex_lock(&s->table_lock);
1341
    return r;
1342
}
1343

1344
/**
1345
 * Begin next I/O or complete the request
1346
 */
1347
static int coroutine_fn GRAPH_RDLOCK qed_aio_next_io(QEDAIOCB *acb)
1348
{
1349
    BDRVQEDState *s = acb_to_s(acb);
1350
    uint64_t offset;
1351
    size_t len;
1352
    int ret;
1353

1354
    qemu_co_mutex_lock(&s->table_lock);
1355
    while (1) {
1356
        trace_qed_aio_next_io(s, acb, 0, acb->cur_pos + acb->cur_qiov.size);
1357

1358
        acb->qiov_offset += acb->cur_qiov.size;
1359
        acb->cur_pos += acb->cur_qiov.size;
1360
        qemu_iovec_reset(&acb->cur_qiov);
1361

1362
        /* Complete request */
1363
        if (acb->cur_pos >= acb->end_pos) {
1364
            ret = 0;
1365
            break;
1366
        }
1367

1368
        /* Find next cluster and start I/O */
1369
        len = acb->end_pos - acb->cur_pos;
1370
        ret = qed_find_cluster(s, &acb->request, acb->cur_pos, &len, &offset);
1371
        if (ret < 0) {
1372
            break;
1373
        }
1374

1375
        if (acb->flags & QED_AIOCB_WRITE) {
1376
            ret = qed_aio_write_data(acb, ret, offset, len);
1377
        } else {
1378
            ret = qed_aio_read_data(acb, ret, offset, len);
1379
        }
1380

1381
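        /*
         * -EAGAIN means this request had to queue behind another allocating
         * write (see qed_aio_write_alloc()); cur_qiov is still empty, so the
         * loop retries the cluster lookup for the same position.
         */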
        if (ret < 0 && ret != -EAGAIN) {
1382
            break;
1383
        }
1384
    }
1385

1386
    trace_qed_aio_complete(s, acb, ret);
1387
    qed_aio_complete(acb);
1388
    qemu_co_mutex_unlock(&s->table_lock);
1389
    return ret;
1390
}
1391

1392
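/*
 * Common entry point for reads, writes and zero writes: the QEDAIOCB lives
 * on the coroutine stack and qed_aio_next_io() drives the whole request to
 * completion before returning, so no explicit completion callback is needed.
 */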
static int coroutine_fn GRAPH_RDLOCK
1393
qed_co_request(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov,
1394
               int nb_sectors, int flags)
1395
{
1396
    QEDAIOCB acb = {
1397
        .bs         = bs,
1398
        .cur_pos    = (uint64_t) sector_num * BDRV_SECTOR_SIZE,
1399
        .end_pos    = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE,
1400
        .qiov       = qiov,
1401
        .flags      = flags,
1402
    };
1403
    qemu_iovec_init(&acb.cur_qiov, qiov->niov);
1404

1405
    trace_qed_aio_setup(bs->opaque, &acb, sector_num, nb_sectors, NULL, flags);
1406

1407
    /* Start request */
1408
    return qed_aio_next_io(&acb);
1409
}
1410

1411
static int coroutine_fn GRAPH_RDLOCK
1412
bdrv_qed_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1413
                  QEMUIOVector *qiov)
1414
{
1415
    return qed_co_request(bs, sector_num, qiov, nb_sectors, 0);
1416
}
1417

1418
static int coroutine_fn GRAPH_RDLOCK
1419
bdrv_qed_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
1420
                   QEMUIOVector *qiov, int flags)
1421
{
1422
    return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
1423
}
1424

1425
static int coroutine_fn GRAPH_RDLOCK
1426
bdrv_qed_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
1427
                          BdrvRequestFlags flags)
1428
{
1429
    BDRVQEDState *s = bs->opaque;
1430

1431
    /*
1432
     * Zero writes start without an I/O buffer.  If a buffer becomes necessary
1433
     * then it will be allocated during request processing.
1434
     */
1435
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
1436

1437
    /*
1438
     * QED is not prepared for 63-bit write-zero requests, so rely on
1439
     * max_pwrite_zeroes.
1440
     */
1441
    assert(bytes <= INT_MAX);
1442

1443
    /* Fall back if the request is not aligned */
1444
    if (qed_offset_into_cluster(s, offset) ||
1445
        qed_offset_into_cluster(s, bytes)) {
1446
        return -ENOTSUP;
1447
    }
1448

1449
    return qed_co_request(bs, offset >> BDRV_SECTOR_BITS, &qiov,
1450
                          bytes >> BDRV_SECTOR_BITS,
1451
                          QED_AIOCB_WRITE | QED_AIOCB_ZERO);
1452
}
1453

1454
static int coroutine_fn GRAPH_RDLOCK
1455
bdrv_qed_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
1456
                     PreallocMode prealloc, BdrvRequestFlags flags,
1457
                     Error **errp)
1458
{
1459
    BDRVQEDState *s = bs->opaque;
1460
    uint64_t old_image_size;
1461
    int ret;
1462

1463
    if (prealloc != PREALLOC_MODE_OFF) {
1464
        error_setg(errp, "Unsupported preallocation mode '%s'",
1465
                   PreallocMode_str(prealloc));
1466
        return -ENOTSUP;
1467
    }
1468

1469
    if (!qed_is_image_size_valid(offset, s->header.cluster_size,
1470
                                 s->header.table_size)) {
1471
        error_setg(errp, "Invalid image size specified");
1472
        return -EINVAL;
1473
    }
1474

1475
    if ((uint64_t)offset < s->header.image_size) {
1476
        error_setg(errp, "Shrinking images is currently not supported");
1477
        return -ENOTSUP;
1478
    }
1479

1480
    old_image_size = s->header.image_size;
1481
    s->header.image_size = offset;
1482
    ret = qed_write_header_sync(s);
1483
    if (ret < 0) {
1484
        s->header.image_size = old_image_size;
1485
        error_setg_errno(errp, -ret, "Failed to update the image size");
1486
    }
1487
    return ret;
1488
}
1489

1490
static int64_t coroutine_fn bdrv_qed_co_getlength(BlockDriverState *bs)
1491
{
1492
    BDRVQEDState *s = bs->opaque;
1493
    return s->header.image_size;
1494
}
1495

1496
static int coroutine_fn
1497
bdrv_qed_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
1498
{
1499
    BDRVQEDState *s = bs->opaque;
1500

1501
    memset(bdi, 0, sizeof(*bdi));
1502
    bdi->cluster_size = s->header.cluster_size;
1503
    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
1504
    return 0;
1505
}
1506

1507
static int coroutine_fn GRAPH_RDLOCK
1508
bdrv_qed_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
1509
                                const char *backing_fmt)
1510
{
1511
    BDRVQEDState *s = bs->opaque;
1512
    QEDHeader new_header, le_header;
1513
    void *buffer;
1514
    size_t buffer_len, backing_file_len;
1515
    int ret;
1516

1517
    /* Refuse to set backing filename if unknown compat feature bits are
1518
     * active.  If the image uses an unknown compat feature then we may not
1519
     * know the layout of data following the header structure and cannot safely
1520
     * add a new string.
1521
     */
1522
    if (backing_file && (s->header.compat_features &
1523
                         ~QED_COMPAT_FEATURE_MASK)) {
1524
        return -ENOTSUP;
1525
    }
1526

1527
    memcpy(&new_header, &s->header, sizeof(new_header));
1528

1529
    new_header.features &= ~(QED_F_BACKING_FILE |
1530
                             QED_F_BACKING_FORMAT_NO_PROBE);
1531

1532
    /* Adjust feature flags */
1533
    if (backing_file) {
1534
        new_header.features |= QED_F_BACKING_FILE;
1535

1536
        if (qed_fmt_is_raw(backing_fmt)) {
1537
            new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE;
1538
        }
1539
    }
1540

1541
    /* Calculate new header size */
1542
    backing_file_len = 0;
1543

1544
    if (backing_file) {
1545
        backing_file_len = strlen(backing_file);
1546
    }
1547

1548
    buffer_len = sizeof(new_header);
1549
    new_header.backing_filename_offset = buffer_len;
1550
    new_header.backing_filename_size = backing_file_len;
1551
    buffer_len += backing_file_len;
1552

1553
    /* Make sure we can rewrite header without failing */
1554
    if (buffer_len > new_header.header_size * new_header.cluster_size) {
1555
        return -ENOSPC;
1556
    }
1557

1558
    /* Prepare new header */
1559
    buffer = g_malloc(buffer_len);
1560

1561
    qed_header_cpu_to_le(&new_header, &le_header);
1562
    memcpy(buffer, &le_header, sizeof(le_header));
1563
    buffer_len = sizeof(le_header);
1564

1565
    if (backing_file) {
1566
        memcpy(buffer + buffer_len, backing_file, backing_file_len);
1567
        buffer_len += backing_file_len;
1568
    }
1569

1570
    /* Write new header */
1571
    ret = bdrv_co_pwrite_sync(bs->file, 0, buffer_len, buffer, 0);
1572
    g_free(buffer);
1573
    if (ret == 0) {
1574
        memcpy(&s->header, &new_header, sizeof(new_header));
1575
    }
1576
    return ret;
1577
}
1578

1579
static void coroutine_fn GRAPH_RDLOCK
1580
bdrv_qed_co_invalidate_cache(BlockDriverState *bs, Error **errp)
1581
{
1582
    ERRP_GUARD();
1583
    BDRVQEDState *s = bs->opaque;
1584
    int ret;
1585

1586
    bdrv_qed_do_close(bs);
1587

1588
    bdrv_qed_init_state(bs);
1589
    qemu_co_mutex_lock(&s->table_lock);
1590
    ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
1591
    qemu_co_mutex_unlock(&s->table_lock);
1592
    if (ret < 0) {
1593
        error_prepend(errp, "Could not reopen qed layer: ");
1594
    }
1595
}
1596

1597
static int coroutine_fn GRAPH_RDLOCK
1598
bdrv_qed_co_check(BlockDriverState *bs, BdrvCheckResult *result,
1599
                  BdrvCheckMode fix)
1600
{
1601
    BDRVQEDState *s = bs->opaque;
1602
    int ret;
1603

1604
    qemu_co_mutex_lock(&s->table_lock);
1605
    ret = qed_check(s, result, !!fix);
1606
    qemu_co_mutex_unlock(&s->table_lock);
1607

1608
    return ret;
1609
}
1610

1611
static QemuOptsList qed_create_opts = {
1612
    .name = "qed-create-opts",
1613
    .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head),
1614
    .desc = {
1615
        {
1616
            .name = BLOCK_OPT_SIZE,
1617
            .type = QEMU_OPT_SIZE,
1618
            .help = "Virtual disk size"
1619
        },
1620
        {
1621
            .name = BLOCK_OPT_BACKING_FILE,
1622
            .type = QEMU_OPT_STRING,
1623
            .help = "File name of a base image"
1624
        },
1625
        {
1626
            .name = BLOCK_OPT_BACKING_FMT,
1627
            .type = QEMU_OPT_STRING,
1628
            .help = "Image format of the base image"
1629
        },
1630
        {
1631
            .name = BLOCK_OPT_CLUSTER_SIZE,
1632
            .type = QEMU_OPT_SIZE,
1633
            .help = "Cluster size (in bytes)",
1634
            .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE)
1635
        },
1636
        {
1637
            .name = BLOCK_OPT_TABLE_SIZE,
1638
            .type = QEMU_OPT_SIZE,
1639
            .help = "L1/L2 table size (in clusters)"
1640
        },
1641
        { /* end of list */ }
1642
    }
1643
};
1644

1645
static BlockDriver bdrv_qed = {
1646
    .format_name                    = "qed",
1647
    .instance_size                  = sizeof(BDRVQEDState),
1648
    .create_opts                    = &qed_create_opts,
1649
    .is_format                      = true,
1650
    .supports_backing               = true,
1651

1652
    .bdrv_probe                     = bdrv_qed_probe,
1653
    .bdrv_open                      = bdrv_qed_open,
1654
    .bdrv_close                     = bdrv_qed_close,
1655
    .bdrv_reopen_prepare            = bdrv_qed_reopen_prepare,
1656
    .bdrv_child_perm                = bdrv_default_perms,
1657
    .bdrv_co_create                 = bdrv_qed_co_create,
1658
    .bdrv_co_create_opts            = bdrv_qed_co_create_opts,
1659
    .bdrv_has_zero_init             = bdrv_has_zero_init_1,
1660
    .bdrv_co_block_status           = bdrv_qed_co_block_status,
1661
    .bdrv_co_readv                  = bdrv_qed_co_readv,
1662
    .bdrv_co_writev                 = bdrv_qed_co_writev,
1663
    .bdrv_co_pwrite_zeroes          = bdrv_qed_co_pwrite_zeroes,
1664
    .bdrv_co_truncate               = bdrv_qed_co_truncate,
1665
    .bdrv_co_getlength              = bdrv_qed_co_getlength,
1666
    .bdrv_co_get_info               = bdrv_qed_co_get_info,
1667
    .bdrv_refresh_limits            = bdrv_qed_refresh_limits,
1668
    .bdrv_co_change_backing_file    = bdrv_qed_co_change_backing_file,
1669
    .bdrv_co_invalidate_cache       = bdrv_qed_co_invalidate_cache,
1670
    .bdrv_co_check                  = bdrv_qed_co_check,
1671
    .bdrv_detach_aio_context        = bdrv_qed_detach_aio_context,
1672
    .bdrv_attach_aio_context        = bdrv_qed_attach_aio_context,
1673
    .bdrv_drain_begin               = bdrv_qed_drain_begin,
1674
};
1675

1676
static void bdrv_qed_init(void)
1677
{
1678
    bdrv_register(&bdrv_qed);
1679
}
1680

1681
block_init(bdrv_qed_init);
1682
