/*
 * QEMU Block backends
 *
 * Copyright (C) 2014-2016 Red Hat, Inc.
 *
 * Authors:
 *  Markus Armbruster <armbru@redhat.com>,
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1
 * or later.  See the COPYING.LIB file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "sysemu/block-backend.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "block/coroutines.h"
#include "block/throttle-groups.h"
#include "hw/qdev-core.h"
#include "sysemu/blockdev.h"
#include "sysemu/runstate.h"
#include "sysemu/replay.h"
#include "qapi/error.h"
#include "qapi/qapi-events-block.h"
#include "qemu/id.h"
#include "qemu/main-loop.h"
#include "qemu/option.h"
#include "trace.h"
#include "migration/misc.h"

/* Number of coroutines to reserve per attached device model */
#define COROUTINE_POOL_RESERVATION 64

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef struct BlockBackendAioNotifier {
    void (*attached_aio_context)(AioContext *new_context, void *opaque);
    void (*detach_aio_context)(void *opaque);
    void *opaque;
    QLIST_ENTRY(BlockBackendAioNotifier) list;
} BlockBackendAioNotifier;

struct BlockBackend {
    char *name;
    int refcnt;
    BdrvChild *root;
    AioContext *ctx; /* access with atomic operations only */
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
    BlockBackendPublic public;

    DeviceState *dev;           /* attached device model, if any */
    const BlockDevOps *dev_ops;
    void *dev_opaque;

    /* If the BDS tree is removed, some of its options are stored here (which
     * can be used to restore those options in the new BDS on insert) */
    BlockBackendRootState root_state;

    bool enable_write_cache;

    /* I/O stats (display with "info blockstats"). */
    BlockAcctStats stats;

    BlockdevOnError on_read_error, on_write_error;
    bool iostatus_enabled;
    BlockDeviceIoStatus iostatus;

    uint64_t perm;
    uint64_t shared_perm;
    bool disable_perm;

    bool allow_aio_context_change;
    bool allow_write_beyond_eof;

    /* Protected by BQL */
    NotifierList remove_bs_notifiers, insert_bs_notifiers;
    QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;

    int quiesce_counter; /* atomic: written under BQL, read by other threads */
    QemuMutex queued_requests_lock; /* protects queued_requests */
    CoQueue queued_requests;
    bool disable_request_queuing; /* atomic */

    VMChangeStateEntry *vmsh;
    bool force_allow_inactivate;

    /* Number of in-flight aio requests.  BlockDriverState also counts
     * in-flight requests but aio requests can exist even when blk->root is
     * NULL, so we cannot rely on its counter for that case.
     * Accessed with atomic ops.
     */
    unsigned int in_flight;
};

typedef struct BlockBackendAIOCB {
    BlockAIOCB common;
    BlockBackend *blk;
    int ret;
} BlockBackendAIOCB;

static const AIOCBInfo block_backend_aiocb_info = {
    .aiocb_size = sizeof(BlockBackendAIOCB),
};

static void drive_info_del(DriveInfo *dinfo);
static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

/* All BlockBackends. Protected by BQL. */
static QTAILQ_HEAD(, BlockBackend) block_backends =
    QTAILQ_HEAD_INITIALIZER(block_backends);

/*
 * All BlockBackends referenced by the monitor and which are iterated through by
 * blk_next(). Protected by BQL.
 */
static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);

static int coroutine_mixed_fn GRAPH_RDLOCK
blk_set_perm_locked(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
                    Error **errp);

static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
                                     int *child_flags, QDict *child_options,
                                     int parent_flags, QDict *parent_options)
{
    /* We're not supposed to call this function for root nodes */
    abort();
}
static void blk_root_drained_begin(BdrvChild *child);
static bool blk_root_drained_poll(BdrvChild *child);
static void blk_root_drained_end(BdrvChild *child);

static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);

static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx,
                                    GHashTable *visited, Transaction *tran,
                                    Error **errp);

static char *blk_root_get_parent_desc(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;
    g_autofree char *dev_id = NULL;

    if (blk->name) {
        return g_strdup_printf("block device '%s'", blk->name);
    }

    dev_id = blk_get_attached_dev_id(blk);
    if (*dev_id) {
        return g_strdup_printf("block device '%s'", dev_id);
    } else {
        /* TODO Callback into the BB owner for something more detailed */
        return g_strdup("an unnamed block device");
    }
}

static const char *blk_root_get_name(BdrvChild *child)
{
    return blk_name(child->opaque);
}

static void blk_vm_state_changed(void *opaque, bool running, RunState state)
{
    Error *local_err = NULL;
    BlockBackend *blk = opaque;

    if (state == RUN_STATE_INMIGRATE) {
        return;
    }

    qemu_del_vm_change_state_handler(blk->vmsh);
    blk->vmsh = NULL;
    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
    if (local_err) {
        error_report_err(local_err);
    }
}

/*
 * Notifies the user of the BlockBackend that migration has completed. qdev
 * devices can tighten their permissions in response (specifically revoke
 * shared write permissions that we needed for storage migration).
 *
 * If an error is returned, the VM cannot be allowed to be resumed.
 */
static void GRAPH_RDLOCK blk_root_activate(BdrvChild *child, Error **errp)
{
    BlockBackend *blk = child->opaque;
    Error *local_err = NULL;
    uint64_t saved_shared_perm;

    if (!blk->disable_perm) {
        return;
    }

    blk->disable_perm = false;

    /*
     * blk->shared_perm contains the permissions we want to share once
     * migration is really completely done.  For now, we need to share
     * all; but we also need to retain blk->shared_perm, which is
     * overwritten by a successful blk_set_perm() call.  Save it and
     * restore it below.
     */
    saved_shared_perm = blk->shared_perm;

    blk_set_perm_locked(blk, blk->perm, BLK_PERM_ALL, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        blk->disable_perm = true;
        return;
    }
    blk->shared_perm = saved_shared_perm;

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        /* Activation can happen when migration process is still active, for
         * example when nbd_server_add is called during non-shared storage
         * migration. Defer the shared_perm update to migration completion. */
        if (!blk->vmsh) {
            blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
                                                         blk);
        }
        return;
    }

    blk_set_perm_locked(blk, blk->perm, blk->shared_perm, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        blk->disable_perm = true;
        return;
    }
}

void blk_set_force_allow_inactivate(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    blk->force_allow_inactivate = true;
}

static bool blk_can_inactivate(BlockBackend *blk)
{
    /* If it is a guest device, inactivate is ok. */
    if (blk->dev || blk_name(blk)[0]) {
        return true;
    }

    /* Inactivating means no more writes to the image can be done,
     * even if those writes would be changes invisible to the
     * guest.  For block job BBs that satisfy this, we can just allow
     * it.  This is the case for mirror job source, which is required
     * by libvirt non-shared block migration. */
    if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
        return true;
    }

    return blk->force_allow_inactivate;
}

static int GRAPH_RDLOCK blk_root_inactivate(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;

    if (blk->disable_perm) {
        return 0;
    }

    if (!blk_can_inactivate(blk)) {
        return -EPERM;
    }

    blk->disable_perm = true;
    if (blk->root) {
        bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
    }

    return 0;
}

static void blk_root_attach(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;
    BlockBackendAioNotifier *notifier;

    trace_blk_root_attach(child, blk, child->bs);

    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
        bdrv_add_aio_context_notifier(child->bs,
                notifier->attached_aio_context,
                notifier->detach_aio_context,
                notifier->opaque);
    }
}

static void blk_root_detach(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;
    BlockBackendAioNotifier *notifier;

    trace_blk_root_detach(child, blk, child->bs);

    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
        bdrv_remove_aio_context_notifier(child->bs,
                notifier->attached_aio_context,
                notifier->detach_aio_context,
                notifier->opaque);
    }
}

static AioContext *blk_root_get_parent_aio_context(BdrvChild *c)
{
    BlockBackend *blk = c->opaque;
    IO_CODE();

    return blk_get_aio_context(blk);
}

static const BdrvChildClass child_root = {
    .inherit_options    = blk_root_inherit_options,

    .change_media       = blk_root_change_media,
    .resize             = blk_root_resize,
    .get_name           = blk_root_get_name,
    .get_parent_desc    = blk_root_get_parent_desc,

    .drained_begin      = blk_root_drained_begin,
    .drained_poll       = blk_root_drained_poll,
    .drained_end        = blk_root_drained_end,

    .activate           = blk_root_activate,
    .inactivate         = blk_root_inactivate,

    .attach             = blk_root_attach,
    .detach             = blk_root_detach,

    .change_aio_ctx     = blk_root_change_aio_ctx,

    .get_parent_aio_context = blk_root_get_parent_aio_context,
};

/*
 * Create a new BlockBackend with a reference count of one.
 *
 * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
 * to request for a block driver node that is attached to this BlockBackend.
 * @shared_perm is a bitmask which describes which permissions may be granted
 * to other users of the attached node.
 * Both sets of permissions can be changed later using blk_set_perm().
 *
 * Return the new BlockBackend on success, null on failure.
 */
BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
{
    BlockBackend *blk;

    GLOBAL_STATE_CODE();

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
    blk->ctx = ctx;
    blk->perm = perm;
    blk->shared_perm = shared_perm;
    blk_set_enable_write_cache(blk, true);

    blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
    blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;

    block_acct_init(&blk->stats);

    qemu_mutex_init(&blk->queued_requests_lock);
    qemu_co_queue_init(&blk->queued_requests);
    notifier_list_init(&blk->remove_bs_notifiers);
    notifier_list_init(&blk->insert_bs_notifiers);
    QLIST_INIT(&blk->aio_notifiers);

    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
    return blk;
}

/*
 * Create a new BlockBackend connected to an existing BlockDriverState.
 *
 * @perm is a bitmask of BLK_PERM_* constants which describes the
 * permissions to request for @bs that is attached to this
 * BlockBackend.  @shared_perm is a bitmask which describes which
 * permissions may be granted to other users of the attached node.
 * Both sets of permissions can be changed later using blk_set_perm().
 *
 * Return the new BlockBackend on success, null on failure.
 */
BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
                              uint64_t shared_perm, Error **errp)
{
    BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);

    GLOBAL_STATE_CODE();

    if (blk_insert_bs(blk, bs, errp) < 0) {
        blk_unref(blk);
        return NULL;
    }
    return blk;
}

/*
 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 * By default, the new BlockBackend is in the main AioContext, but if the
 * parameters connect it with any existing node in a different AioContext, it
 * may end up there instead.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
 *
 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 * BDS tree just by specifying the @options QDict (or @reference,
 * alternatively). At the time of adding this function, this is not possible,
 * though, so callers of this function have to be able to specify @filename and
 * @flags.
 */
BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
{
    BlockBackend *blk;
    BlockDriverState *bs;
    uint64_t perm = 0;
    uint64_t shared = BLK_PERM_ALL;

    GLOBAL_STATE_CODE();

    /*
     * blk_new_open() is mainly used in .bdrv_create implementations and the
     * tools where sharing isn't a major concern because the BDS stays private
     * and the file is generally not supposed to be used by a second process,
     * so we just request permission according to the flags.
     *
     * The exceptions are xen_disk and blockdev_init(); in these cases, the
     * caller of blk_new_open() doesn't make use of the permissions, but they
     * shouldn't hurt either. We can still share everything here because the
     * guest devices will add their own blockers if they can't share.
     */
    if ((flags & BDRV_O_NO_IO) == 0) {
        perm |= BLK_PERM_CONSISTENT_READ;
        if (flags & BDRV_O_RDWR) {
            perm |= BLK_PERM_WRITE;
        }
    }
    if (flags & BDRV_O_RESIZE) {
        perm |= BLK_PERM_RESIZE;
    }
    if (flags & BDRV_O_NO_SHARE) {
        shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
    }

    bs = bdrv_open(filename, reference, options, flags, errp);
    if (!bs) {
        return NULL;
    }

    /* bdrv_open() could have moved bs to a different AioContext */
    blk = blk_new(bdrv_get_aio_context(bs), perm, shared);
    blk->perm = perm;
    blk->shared_perm = shared;

    blk_insert_bs(blk, bs, errp);
    bdrv_unref(bs);

    if (!blk->root) {
        blk_unref(blk);
        return NULL;
    }

    return blk;
}
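
/*
 * Illustrative sketch (not part of the original file): a typical caller,
 * e.g. an image manipulation tool, might open an image read-write like
 * this.  The filename and error handling below are hypothetical.
 *
 *     Error *local_err = NULL;
 *     BlockBackend *blk;
 *
 *     blk = blk_new_open("test.qcow2", NULL, NULL, BDRV_O_RDWR, &local_err);
 *     if (!blk) {
 *         error_report_err(local_err);
 *         return -1;
 *     }
 *     ...
 *     blk_unref(blk);
 */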

static void blk_delete(BlockBackend *blk)
{
    assert(!blk->refcnt);
    assert(!blk->name);
    assert(!blk->dev);
    if (blk->public.throttle_group_member.throttle_state) {
        blk_io_limits_disable(blk);
    }
    if (blk->root) {
        blk_remove_bs(blk);
    }
    if (blk->vmsh) {
        qemu_del_vm_change_state_handler(blk->vmsh);
        blk->vmsh = NULL;
    }
    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->aio_notifiers));
    assert(qemu_co_queue_empty(&blk->queued_requests));
    qemu_mutex_destroy(&blk->queued_requests_lock);
    QTAILQ_REMOVE(&block_backends, blk, link);
    drive_info_del(blk->legacy_dinfo);
    block_acct_cleanup(&blk->stats);
    g_free(blk);
}

static void drive_info_del(DriveInfo *dinfo)
{
    if (!dinfo) {
        return;
    }
    qemu_opts_del(dinfo->opts);
    g_free(dinfo);
}

int blk_get_refcnt(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return blk ? blk->refcnt : 0;
}

/*
 * Increment @blk's reference count.
 * @blk must not be null.
 */
void blk_ref(BlockBackend *blk)
{
    assert(blk->refcnt > 0);
    GLOBAL_STATE_CODE();
    blk->refcnt++;
}

/*
 * Decrement @blk's reference count.
 * If this drops it to zero, destroy @blk.
 * For convenience, do nothing if @blk is null.
 */
void blk_unref(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    if (blk) {
        assert(blk->refcnt > 0);
        if (blk->refcnt > 1) {
            blk->refcnt--;
        } else {
            blk_drain(blk);
            /* blk_drain() cannot resurrect blk, nobody held a reference */
            assert(blk->refcnt == 1);
            blk->refcnt = 0;
            blk_delete(blk);
        }
    }
}
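
/*
 * Illustrative sketch (not part of the original file): the usual pattern
 * for keeping a BlockBackend alive across an operation is to pair
 * blk_ref()/blk_unref(), both under the BQL.  do_something() is a
 * hypothetical helper.
 *
 *     blk_ref(blk);       // hold an extra reference
 *     do_something(blk);
 *     blk_unref(blk);     // may destroy blk if this was the last reference
 */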

/*
 * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
 * ones which are hidden (i.e. are not referenced by the monitor).
 */
BlockBackend *blk_all_next(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return blk ? QTAILQ_NEXT(blk, link)
               : QTAILQ_FIRST(&block_backends);
}

void blk_remove_all_bs(void)
{
    BlockBackend *blk = NULL;

    GLOBAL_STATE_CODE();

    while ((blk = blk_all_next(blk)) != NULL) {
        if (blk->root) {
            blk_remove_bs(blk);
        }
    }
}

/*
 * Return the monitor-owned BlockBackend after @blk.
 * If @blk is null, return the first one.
 * Else, return @blk's next sibling, which may be null.
 *
 * To iterate over all BlockBackends, do
 * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 *     ...
 * }
 */
BlockBackend *blk_next(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return blk ? QTAILQ_NEXT(blk, monitor_link)
               : QTAILQ_FIRST(&monitor_block_backends);
}

/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 * the monitor or attached to a BlockBackend */
BlockDriverState *bdrv_next(BdrvNextIterator *it)
{
    BlockDriverState *bs, *old_bs;

    /* Must be called from the main loop */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    old_bs = it->bs;

    /* First, return all root nodes of BlockBackends. In order to avoid
     * returning a BDS twice when multiple BBs refer to it, we only return it
     * if the BB is the first one in the parent list of the BDS. */
    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
        BlockBackend *old_blk = it->blk;

        do {
            it->blk = blk_all_next(it->blk);
            bs = it->blk ? blk_bs(it->blk) : NULL;
        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));

        if (it->blk) {
            blk_ref(it->blk);
        }
        blk_unref(old_blk);

        if (bs) {
            bdrv_ref(bs);
            bdrv_unref(old_bs);
            it->bs = bs;
            return bs;
        }
        it->phase = BDRV_NEXT_MONITOR_OWNED;
    }

    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
     * BDSes that are attached to a BlockBackend here; they have been handled
     * by the above block already */
    do {
        it->bs = bdrv_next_monitor_owned(it->bs);
        bs = it->bs;
    } while (bs && bdrv_has_blk(bs));

    if (bs) {
        bdrv_ref(bs);
    }
    bdrv_unref(old_bs);

    return bs;
}

static void bdrv_next_reset(BdrvNextIterator *it)
{
    *it = (BdrvNextIterator) {
        .phase = BDRV_NEXT_BACKEND_ROOTS,
    };
}

BlockDriverState *bdrv_first(BdrvNextIterator *it)
{
    GLOBAL_STATE_CODE();
    bdrv_next_reset(it);
    return bdrv_next(it);
}

/* Must be called when aborting a bdrv_next() iteration before
 * bdrv_next() returns NULL */
void bdrv_next_cleanup(BdrvNextIterator *it)
{
    /* Must be called from the main loop */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    bdrv_unref(it->bs);

    if (it->phase == BDRV_NEXT_BACKEND_ROOTS && it->blk) {
        blk_unref(it->blk);
    }

    bdrv_next_reset(it);
}
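
/*
 * Illustrative sketch (not part of the original file): iterating over all
 * top-level BlockDriverStates with bdrv_first()/bdrv_next().  If the loop
 * is left early, bdrv_next_cleanup() must drop the iterator's references.
 * should_stop() is a hypothetical predicate.
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         if (should_stop(bs)) {
 *             bdrv_next_cleanup(&it);
 *             break;
 *         }
 *     }
 */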

/*
 * Add a BlockBackend into the list of backends referenced by the monitor, with
 * the given @name acting as the handle for the monitor.
 * Strictly for use by blockdev.c.
 *
 * @name must not be null or empty.
 *
 * Returns true on success and false on failure. In the latter case, an Error
 * object is returned through @errp.
 */
bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
{
    assert(!blk->name);
    assert(name && name[0]);
    GLOBAL_STATE_CODE();

    if (!id_wellformed(name)) {
        error_setg(errp, "Invalid device name");
        return false;
    }
    if (blk_by_name(name)) {
        error_setg(errp, "Device with id '%s' already exists", name);
        return false;
    }
    if (bdrv_find_node(name)) {
        error_setg(errp,
                   "Device name '%s' conflicts with an existing node name",
                   name);
        return false;
    }

    blk->name = g_strdup(name);
    QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
    return true;
}

/*
 * Remove a BlockBackend from the list of backends referenced by the monitor.
 * Strictly for use by blockdev.c.
 */
void monitor_remove_blk(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();

    if (!blk->name) {
        return;
    }

    QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
    g_free(blk->name);
    blk->name = NULL;
}

/*
 * Return @blk's name, a non-null string.
 * Returns an empty string iff @blk is not referenced by the monitor.
 */
const char *blk_name(const BlockBackend *blk)
{
    IO_CODE();
    return blk->name ?: "";
}

/*
 * Return the BlockBackend with name @name if it exists, else null.
 * @name must not be null.
 */
BlockBackend *blk_by_name(const char *name)
{
    BlockBackend *blk = NULL;

    GLOBAL_STATE_CODE();
    assert(name);
    while ((blk = blk_next(blk)) != NULL) {
        if (!strcmp(name, blk->name)) {
            return blk;
        }
    }
    return NULL;
}

/*
 * Return the BlockDriverState attached to @blk if any, else null.
 */
BlockDriverState *blk_bs(BlockBackend *blk)
{
    IO_CODE();
    return blk->root ? blk->root->bs : NULL;
}

static BlockBackend * GRAPH_RDLOCK bdrv_first_blk(BlockDriverState *bs)
{
    BdrvChild *child;

    GLOBAL_STATE_CODE();
    assert_bdrv_graph_readable();

    QLIST_FOREACH(child, &bs->parents, next_parent) {
        if (child->klass == &child_root) {
            return child->opaque;
        }
    }

    return NULL;
}

/*
 * Returns true if @bs has an associated BlockBackend.
 */
bool bdrv_has_blk(BlockDriverState *bs)
{
    GLOBAL_STATE_CODE();
    return bdrv_first_blk(bs) != NULL;
}

/*
 * Returns true if @bs has only BlockBackends as parents.
 */
bool bdrv_is_root_node(BlockDriverState *bs)
{
    BdrvChild *c;

    GLOBAL_STATE_CODE();
    assert_bdrv_graph_readable();

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->klass != &child_root) {
            return false;
        }
    }

    return true;
}

/*
 * Return @blk's DriveInfo if any, else null.
 */
DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return blk->legacy_dinfo;
}

/*
 * Set @blk's DriveInfo to @dinfo, and return it.
 * @blk must not have a DriveInfo set already.
 * No other BlockBackend may have the same DriveInfo set.
 */
DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
{
    assert(!blk->legacy_dinfo);
    GLOBAL_STATE_CODE();
    return blk->legacy_dinfo = dinfo;
}

/*
 * Return the BlockBackend with DriveInfo @dinfo.
 * It must exist.
 */
BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
{
    BlockBackend *blk = NULL;
    GLOBAL_STATE_CODE();

    while ((blk = blk_next(blk)) != NULL) {
        if (blk->legacy_dinfo == dinfo) {
            return blk;
        }
    }
    abort();
}

/*
 * Returns a pointer to the publicly accessible fields of @blk.
 */
BlockBackendPublic *blk_get_public(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return &blk->public;
}

/*
 * Returns a BlockBackend given the associated @public fields.
 */
BlockBackend *blk_by_public(BlockBackendPublic *public)
{
    GLOBAL_STATE_CODE();
    return container_of(public, BlockBackend, public);
}

/*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
void blk_remove_bs(BlockBackend *blk)
{
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
    BdrvChild *root;

    GLOBAL_STATE_CODE();

    notifier_list_notify(&blk->remove_bs_notifiers, blk);
    if (tgm->throttle_state) {
        BlockDriverState *bs = blk_bs(blk);

        /*
         * Take a ref in case blk_bs() changes across bdrv_drained_begin(), for
         * example, if a temporary filter node is removed by a blockjob.
         */
        bdrv_ref(bs);
        bdrv_drained_begin(bs);
        throttle_group_detach_aio_context(tgm);
        throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
        bdrv_drained_end(bs);
        bdrv_unref(bs);
    }

    blk_update_root_state(blk);

    /* bdrv_root_unref_child() will cause blk->root to become stale and may
     * switch to a completion coroutine later on. Let's drain all I/O here
     * to avoid that and a potential QEMU crash.
     */
    blk_drain(blk);
    root = blk->root;
    blk->root = NULL;

    bdrv_graph_wrlock();
    bdrv_root_unref_child(root);
    bdrv_graph_wrunlock();
}

/*
 * Associates a new BlockDriverState with @blk.
 */
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;

    GLOBAL_STATE_CODE();
    bdrv_ref(bs);
    bdrv_graph_wrlock();
    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
                                       BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
                                       blk->perm, blk->shared_perm,
                                       blk, errp);
    bdrv_graph_wrunlock();
    if (blk->root == NULL) {
        return -EPERM;
    }

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
    if (tgm->throttle_state) {
        throttle_group_detach_aio_context(tgm);
        throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
    }

    return 0;
}

/*
 * Change BlockDriverState associated with @blk.
 */
int blk_replace_bs(BlockBackend *blk, BlockDriverState *new_bs, Error **errp)
{
    GLOBAL_STATE_CODE();
    return bdrv_replace_child_bs(blk->root, new_bs, errp);
}

/*
 * Sets the permission bitmasks that the user of the BlockBackend needs.
 */
static int coroutine_mixed_fn GRAPH_RDLOCK
blk_set_perm_locked(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
                    Error **errp)
{
    int ret;
    GLOBAL_STATE_CODE();

    if (blk->root && !blk->disable_perm) {
        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
        if (ret < 0) {
            return ret;
        }
    }

    blk->perm = perm;
    blk->shared_perm = shared_perm;

    return 0;
}

int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
                 Error **errp)
{
    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    return blk_set_perm_locked(blk, perm, shared_perm, errp);
}
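
/*
 * Illustrative sketch (not part of the original file): a user that only
 * needs to read consistent data while letting other users do anything
 * could request its permissions like this; the error handling is
 * hypothetical.
 *
 *     Error *local_err = NULL;
 *
 *     if (blk_set_perm(blk, BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL,
 *                      &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */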

void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
{
    GLOBAL_STATE_CODE();
    *perm = blk->perm;
    *shared_perm = blk->shared_perm;
}

/*
 * Attach device model @dev to @blk.
 * Return 0 on success, -EBUSY when a device model is attached already.
 */
int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
{
    GLOBAL_STATE_CODE();
    if (blk->dev) {
        return -EBUSY;
    }

    /* While migration is still incoming, we don't need to apply the
     * permissions of guest device BlockBackends. We might still have a block
     * job or NBD server writing to the image for storage migration. */
    if (runstate_check(RUN_STATE_INMIGRATE)) {
        blk->disable_perm = true;
    }

    blk_ref(blk);
    blk->dev = dev;
    blk_iostatus_reset(blk);

    return 0;
}

/*
 * Detach device model @dev from @blk.
 * @dev must be currently attached to @blk.
 */
void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
{
    assert(blk->dev == dev);
    GLOBAL_STATE_CODE();
    blk->dev = NULL;
    blk->dev_ops = NULL;
    blk->dev_opaque = NULL;
    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
    blk_unref(blk);
}

/*
 * Return the device model attached to @blk if any, else null.
 */
DeviceState *blk_get_attached_dev(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return blk->dev;
}

/* Return the qdev ID, or if no ID is assigned the QOM path, of the block
 * device attached to the BlockBackend. */
char *blk_get_attached_dev_id(BlockBackend *blk)
{
    DeviceState *dev = blk->dev;
    IO_CODE();

    if (!dev) {
        return g_strdup("");
    } else if (dev->id) {
        return g_strdup(dev->id);
    }

    return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
}

/*
 * Return the BlockBackend which has the device model @dev attached if it
 * exists, else null.
 *
 * @dev must not be null.
 */
BlockBackend *blk_by_dev(void *dev)
{
    BlockBackend *blk = NULL;

    GLOBAL_STATE_CODE();

    assert(dev != NULL);
    while ((blk = blk_all_next(blk)) != NULL) {
        if (blk->dev == dev) {
            return blk;
        }
    }
    return NULL;
}

/*
 * Set @blk's device model callbacks to @ops.
 * @opaque is the opaque argument to pass to the callbacks.
 * This is for use by device models.
 */
void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
                     void *opaque)
{
    GLOBAL_STATE_CODE();
    blk->dev_ops = ops;
    blk->dev_opaque = opaque;

    /* Are we currently quiesced? Should we enforce this right now? */
    if (qatomic_read(&blk->quiesce_counter) && ops && ops->drained_begin) {
        ops->drained_begin(opaque);
    }
}
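
/*
 * Illustrative sketch (not part of the original file): a device model
 * typically provides a static BlockDevOps table and registers it once at
 * realize time.  The callback and state names below are hypothetical; the
 * fields shown (.change_media_cb, .is_tray_open, .resize_cb) are the ones
 * consumed elsewhere in this file.
 *
 *     static const BlockDevOps my_dev_block_ops = {
 *         .change_media_cb = my_dev_change_media_cb,
 *         .is_tray_open    = my_dev_is_tray_open,
 *         .resize_cb       = my_dev_resize_cb,
 *     };
 *
 *     blk_set_dev_ops(blk, &my_dev_block_ops, my_dev_state);
 */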

/*
 * Notify @blk's attached device model of media change.
 *
 * If @load is true, notify of media load. This action can fail, meaning that
 * the medium cannot be loaded. @errp is set then.
 *
 * If @load is false, notify of media eject. This can never fail.
 *
 * Also send DEVICE_TRAY_MOVED events as appropriate.
 */
void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
{
    GLOBAL_STATE_CODE();
    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
        bool tray_was_open, tray_is_open;
        Error *local_err = NULL;

        tray_was_open = blk_dev_is_tray_open(blk);
        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
        if (local_err) {
            assert(load == true);
            error_propagate(errp, local_err);
            return;
        }
        tray_is_open = blk_dev_is_tray_open(blk);

        if (tray_was_open != tray_is_open) {
            char *id = blk_get_attached_dev_id(blk);
            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
            g_free(id);
        }
    }
}

static void blk_root_change_media(BdrvChild *child, bool load)
{
    blk_dev_change_media_cb(child->opaque, load, NULL);
}

/*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
 */
bool blk_dev_has_removable_media(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
}

/*
 * Does @blk's attached device model have a tray?
 */
bool blk_dev_has_tray(BlockBackend *blk)
{
    IO_CODE();
    return blk->dev_ops && blk->dev_ops->is_tray_open;
}

/*
 * Notify @blk's attached device model of a media eject request.
 * If @force is true, the medium is about to be yanked out forcefully.
 */
void blk_dev_eject_request(BlockBackend *blk, bool force)
{
    GLOBAL_STATE_CODE();
    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
    }
}

/*
 * Does @blk's attached device model have a tray, and is it open?
 */
bool blk_dev_is_tray_open(BlockBackend *blk)
{
    IO_CODE();
    if (blk_dev_has_tray(blk)) {
        return blk->dev_ops->is_tray_open(blk->dev_opaque);
    }
    return false;
}

/*
 * Does @blk's attached device model have the medium locked?
 * %false if the device model has no such lock.
 */
bool blk_dev_is_medium_locked(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
    }
    return false;
}

/*
 * Notify @blk's attached device model of a backend size change.
 */
static void blk_root_resize(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;

    if (blk->dev_ops && blk->dev_ops->resize_cb) {
        blk->dev_ops->resize_cb(blk->dev_opaque);
    }
}

void blk_iostatus_enable(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    blk->iostatus_enabled = true;
    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool blk_iostatus_is_enabled(const BlockBackend *blk)
{
    IO_CODE();
    return (blk->iostatus_enabled &&
           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    return blk->iostatus;
}

void blk_iostatus_disable(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    blk->iostatus_enabled = false;
}

void blk_iostatus_reset(BlockBackend *blk)
{
    GLOBAL_STATE_CODE();
    if (blk_iostatus_is_enabled(blk)) {
        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

void blk_iostatus_set_err(BlockBackend *blk, int error)
{
    IO_CODE();
    assert(blk_iostatus_is_enabled(blk));
    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                          BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
{
    IO_CODE();
    blk->allow_write_beyond_eof = allow;
}

void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
{
    IO_CODE();
    blk->allow_aio_context_change = allow;
}

void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
{
    IO_CODE();
    qatomic_set(&blk->disable_request_queuing, disable);
}

static int coroutine_fn GRAPH_RDLOCK
blk_check_byte_request(BlockBackend *blk, int64_t offset, int64_t bytes)
{
    int64_t len;

    if (bytes < 0) {
        return -EIO;
    }

    if (!blk_co_is_available(blk)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    if (!blk->allow_write_beyond_eof) {
        len = bdrv_co_getlength(blk_bs(blk));
        if (len < 0) {
            return len;
        }

        if (offset > len || len - offset < bytes) {
            return -EIO;
        }
    }

    return 0;
}

/* Are we currently in a drained section? */
bool blk_in_drain(BlockBackend *blk)
{
    GLOBAL_STATE_CODE(); /* change to IO_OR_GS_CODE(), if necessary */
    return qatomic_read(&blk->quiesce_counter);
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
{
    assert(blk->in_flight > 0);

    if (qatomic_read(&blk->quiesce_counter) &&
        !qatomic_read(&blk->disable_request_queuing)) {
        /*
         * Take lock before decrementing in flight counter so main loop thread
         * waits for us to enqueue ourselves before it can leave the drained
         * section.
         */
        qemu_mutex_lock(&blk->queued_requests_lock);
        blk_dec_in_flight(blk);
        qemu_co_queue_wait(&blk->queued_requests, &blk->queued_requests_lock);
        blk_inc_in_flight(blk);
        qemu_mutex_unlock(&blk->queued_requests_lock);
    }
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_co_do_preadv_part(BlockBackend *blk, int64_t offset, int64_t bytes,
                      QEMUIOVector *qiov, size_t qiov_offset,
                      BdrvRequestFlags flags)
{
    int ret;
    BlockDriverState *bs;
    IO_CODE();

    blk_wait_while_drained(blk);
    GRAPH_RDLOCK_GUARD();

    /* Call blk_bs() only after waiting, the graph may have changed */
    bs = blk_bs(blk);
    trace_blk_co_preadv(blk, bs, offset, bytes, flags);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* throttling disk I/O */
    if (blk->public.throttle_group_member.throttle_state) {
        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
                bytes, THROTTLE_READ);
    }

    ret = bdrv_co_preadv_part(blk->root, offset, bytes, qiov, qiov_offset,
                              flags);
    bdrv_dec_in_flight(bs);
    return ret;
}

int coroutine_fn blk_co_pread(BlockBackend *blk, int64_t offset, int64_t bytes,
                              void *buf, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
    IO_OR_GS_CODE();

    assert(bytes <= SIZE_MAX);

    return blk_co_preadv(blk, offset, bytes, &qiov, flags);
}

int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                               int64_t bytes, QEMUIOVector *qiov,
                               BdrvRequestFlags flags)
{
    int ret;
    IO_OR_GS_CODE();

    blk_inc_in_flight(blk);
    ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, 0, flags);
    blk_dec_in_flight(blk);

    return ret;
}

int coroutine_fn blk_co_preadv_part(BlockBackend *blk, int64_t offset,
                                    int64_t bytes, QEMUIOVector *qiov,
                                    size_t qiov_offset, BdrvRequestFlags flags)
{
    int ret;
    IO_OR_GS_CODE();

    blk_inc_in_flight(blk);
    ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, qiov_offset, flags);
    blk_dec_in_flight(blk);

    return ret;
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes,
                       QEMUIOVector *qiov, size_t qiov_offset,
                       BdrvRequestFlags flags)
{
    int ret;
    BlockDriverState *bs;
    IO_CODE();

    blk_wait_while_drained(blk);
    GRAPH_RDLOCK_GUARD();

    /* Call blk_bs() only after waiting, the graph may have changed */
    bs = blk_bs(blk);
    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /* throttling disk I/O */
    if (blk->public.throttle_group_member.throttle_state) {
        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
                bytes, THROTTLE_WRITE);
    }

    if (!blk->enable_write_cache) {
        flags |= BDRV_REQ_FUA;
    }

    ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
                               flags);
    bdrv_dec_in_flight(bs);
    return ret;
}

int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
                                     int64_t bytes,
                                     QEMUIOVector *qiov, size_t qiov_offset,
                                     BdrvRequestFlags flags)
{
    int ret;
    IO_OR_GS_CODE();

    blk_inc_in_flight(blk);
    ret = blk_co_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
    blk_dec_in_flight(blk);

    return ret;
}

int coroutine_fn blk_co_pwrite(BlockBackend *blk, int64_t offset, int64_t bytes,
                               const void *buf, BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
    IO_OR_GS_CODE();

    assert(bytes <= SIZE_MAX);

    return blk_co_pwritev(blk, offset, bytes, &qiov, flags);
}

int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                int64_t bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags)
{
    IO_OR_GS_CODE();
    return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
}
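
/*
 * Illustrative sketch (not part of the original file): inside coroutine
 * context, a flat buffer can be read and written with the convenience
 * wrappers above.  Buffer, offset and size are hypothetical.
 *
 *     uint8_t buf[4096];
 *     int ret;
 *
 *     ret = blk_co_pread(blk, 0, sizeof(buf), buf, 0);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ret = blk_co_pwrite(blk, 0, sizeof(buf), buf, 0);
 */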

int coroutine_fn blk_co_block_status_above(BlockBackend *blk,
                                           BlockDriverState *base,
                                           int64_t offset, int64_t bytes,
                                           int64_t *pnum, int64_t *map,
                                           BlockDriverState **file)
{
    IO_CODE();
    GRAPH_RDLOCK_GUARD();
    return bdrv_co_block_status_above(blk_bs(blk), base, offset, bytes, pnum,
                                      map, file);
}

int coroutine_fn blk_co_is_allocated_above(BlockBackend *blk,
                                           BlockDriverState *base,
                                           bool include_base, int64_t offset,
                                           int64_t bytes, int64_t *pnum)
{
    IO_CODE();
    GRAPH_RDLOCK_GUARD();
    return bdrv_co_is_allocated_above(blk_bs(blk), base, include_base, offset,
                                      bytes, pnum);
}

typedef struct BlkRwCo {
    BlockBackend *blk;
    int64_t offset;
    void *iobuf;
    int ret;
    BdrvRequestFlags flags;
} BlkRwCo;

int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
{
    GLOBAL_STATE_CODE();
    return bdrv_make_zero(blk->root, flags);
}

void blk_inc_in_flight(BlockBackend *blk)
{
    IO_CODE();
    qatomic_inc(&blk->in_flight);
}

void blk_dec_in_flight(BlockBackend *blk)
{
    IO_CODE();
    qatomic_dec(&blk->in_flight);
    aio_wait_kick();
}

static void error_callback_bh(void *opaque)
{
    struct BlockBackendAIOCB *acb = opaque;

    blk_dec_in_flight(acb->blk);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
}

BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
                                  BlockCompletionFunc *cb,
                                  void *opaque, int ret)
{
    struct BlockBackendAIOCB *acb;
    IO_CODE();

    blk_inc_in_flight(blk);
    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
    acb->blk = blk;
    acb->ret = ret;

    replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
                                     error_callback_bh, acb);
    return &acb->common;
}

typedef struct BlkAioEmAIOCB {
    BlockAIOCB common;
    BlkRwCo rwco;
    int64_t bytes;
    bool has_returned;
} BlkAioEmAIOCB;

static const AIOCBInfo blk_aio_em_aiocb_info = {
    .aiocb_size         = sizeof(BlkAioEmAIOCB),
};

static void blk_aio_complete(BlkAioEmAIOCB *acb)
{
    if (acb->has_returned) {
        acb->common.cb(acb->common.opaque, acb->rwco.ret);
        blk_dec_in_flight(acb->rwco.blk);
        qemu_aio_unref(acb);
    }
}

static void blk_aio_complete_bh(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    assert(acb->has_returned);
    blk_aio_complete(acb);
}

static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset,
                                int64_t bytes,
                                void *iobuf, CoroutineEntry co_entry,
                                BdrvRequestFlags flags,
                                BlockCompletionFunc *cb, void *opaque)
{
    BlkAioEmAIOCB *acb;
    Coroutine *co;

    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
        .iobuf  = iobuf,
        .flags  = flags,
        .ret    = NOT_DONE,
    };
    acb->bytes = bytes;
    acb->has_returned = false;

    co = qemu_coroutine_create(co_entry, acb);
    aio_co_enter(qemu_get_current_aio_context(), co);

    acb->has_returned = true;
    if (acb->rwco.ret != NOT_DONE) {
        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
                                         blk_aio_complete_bh, acb);
    }

    return &acb->common;
}

static void coroutine_fn blk_aio_read_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;
    QEMUIOVector *qiov = rwco->iobuf;

    assert(qiov->size == acb->bytes);
    rwco->ret = blk_co_do_preadv_part(rwco->blk, rwco->offset, acb->bytes, qiov,
                                      0, rwco->flags);
    blk_aio_complete(acb);
}

static void coroutine_fn blk_aio_write_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;
    QEMUIOVector *qiov = rwco->iobuf;

    assert(!qiov || qiov->size == acb->bytes);
    rwco->ret = blk_co_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
                                       qiov, 0, rwco->flags);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                  int64_t bytes, BdrvRequestFlags flags,
                                  BlockCompletionFunc *cb, void *opaque)
{
    IO_CODE();
    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_write_entry,
                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
}

int64_t coroutine_fn blk_co_getlength(BlockBackend *blk)
{
    IO_CODE();
    GRAPH_RDLOCK_GUARD();

    if (!blk_co_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_co_getlength(blk_bs(blk));
}

int64_t coroutine_fn blk_co_nb_sectors(BlockBackend *blk)
{
    BlockDriverState *bs = blk_bs(blk);

    IO_CODE();
    GRAPH_RDLOCK_GUARD();

    if (!bs) {
        return -ENOMEDIUM;
    } else {
        return bdrv_co_nb_sectors(bs);
    }
}

/*
 * This wrapper is written by hand because this function is in the hot I/O path,
 * via blk_get_geometry.
 */
int64_t coroutine_mixed_fn blk_nb_sectors(BlockBackend *blk)
{
    BlockDriverState *bs = blk_bs(blk);

    IO_CODE();

    if (!bs) {
        return -ENOMEDIUM;
    } else {
        return bdrv_nb_sectors(bs);
    }
}

/* return 0 as number of sectors if no device present or error */
void coroutine_fn blk_co_get_geometry(BlockBackend *blk,
                                      uint64_t *nb_sectors_ptr)
{
    int64_t ret = blk_co_nb_sectors(blk);
    *nb_sectors_ptr = ret < 0 ? 0 : ret;
}

/*
 * This wrapper is written by hand because this function is in the hot I/O path.
 */
void coroutine_mixed_fn blk_get_geometry(BlockBackend *blk,
                                         uint64_t *nb_sectors_ptr)
{
    int64_t ret = blk_nb_sectors(blk);
    *nb_sectors_ptr = ret < 0 ? 0 : ret;
}

BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
                           QEMUIOVector *qiov, BdrvRequestFlags flags,
                           BlockCompletionFunc *cb, void *opaque)
{
    IO_CODE();
    assert((uint64_t)qiov->size <= INT64_MAX);
    return blk_aio_prwv(blk, offset, qiov->size, qiov,
                        blk_aio_read_entry, flags, cb, opaque);
}

BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                            QEMUIOVector *qiov, BdrvRequestFlags flags,
                            BlockCompletionFunc *cb, void *opaque)
{
    IO_CODE();
    assert((uint64_t)qiov->size <= INT64_MAX);
    return blk_aio_prwv(blk, offset, qiov->size, qiov,
                        blk_aio_write_entry, flags, cb, opaque);
}
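
/*
 * Illustrative sketch (not part of the original file): the AIO variants
 * return immediately and invoke a completion callback with the request's
 * return value.  The callback, buffer and opaque pointer are hypothetical.
 *
 *     static void my_read_complete(void *opaque, int ret)
 *     {
 *         // ret < 0 is -errno, 0 means success
 *     }
 *
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *     blk_aio_preadv(blk, offset, &qiov, 0, my_read_complete, NULL);
 */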

void blk_aio_cancel(BlockAIOCB *acb)
{
    GLOBAL_STATE_CODE();
    bdrv_aio_cancel(acb);
}

void blk_aio_cancel_async(BlockAIOCB *acb)
{
    IO_CODE();
    bdrv_aio_cancel_async(acb);
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
{
    IO_CODE();

    blk_wait_while_drained(blk);
    GRAPH_RDLOCK_GUARD();

    if (!blk_co_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_co_ioctl(blk_bs(blk), req, buf);
}

int coroutine_fn blk_co_ioctl(BlockBackend *blk, unsigned long int req,
                              void *buf)
{
    int ret;
    IO_OR_GS_CODE();

    blk_inc_in_flight(blk);
    ret = blk_co_do_ioctl(blk, req, buf);
    blk_dec_in_flight(blk);

    return ret;
}

static void coroutine_fn blk_aio_ioctl_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_co_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);

    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                          BlockCompletionFunc *cb, void *opaque)
{
    IO_CODE();
    return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
{
    int ret;
    IO_CODE();

    blk_wait_while_drained(blk);
    GRAPH_RDLOCK_GUARD();

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    return bdrv_co_pdiscard(blk->root, offset, bytes);
}

static void coroutine_fn blk_aio_pdiscard_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_co_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
                             int64_t offset, int64_t bytes,
                             BlockCompletionFunc *cb, void *opaque)
{
    IO_CODE();
    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
                        cb, opaque);
}

int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
                                 int64_t bytes)
{
    int ret;
    IO_OR_GS_CODE();

    blk_inc_in_flight(blk);
    ret = blk_co_do_pdiscard(blk, offset, bytes);
    blk_dec_in_flight(blk);

    return ret;
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn blk_co_do_flush(BlockBackend *blk)
{
    IO_CODE();
    blk_wait_while_drained(blk);
    GRAPH_RDLOCK_GUARD();

    if (!blk_co_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_co_flush(blk_bs(blk));
}

static void coroutine_fn blk_aio_flush_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_co_do_flush(rwco->blk);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                          BlockCompletionFunc *cb, void *opaque)
{
    IO_CODE();
    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
}

int coroutine_fn blk_co_flush(BlockBackend *blk)
{
    int ret;
    IO_OR_GS_CODE();

    blk_inc_in_flight(blk);
    ret = blk_co_do_flush(blk);
    blk_dec_in_flight(blk);

    return ret;
}

static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
                                   (unsigned int*)(uintptr_t)acb->bytes,
                                   rwco->iobuf);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
                                unsigned int *nr_zones,
                                BlockZoneDescriptor  *zones,
                                BlockCompletionFunc *cb, void *opaque)
{
    BlkAioEmAIOCB *acb;
    Coroutine *co;
    IO_CODE();

    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
        .iobuf  = zones,
        .ret    = NOT_DONE,
    };
    acb->bytes = (int64_t)(uintptr_t)nr_zones,
    acb->has_returned = false;

    co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
    aio_co_enter(qemu_get_current_aio_context(), co);

    acb->has_returned = true;
    if (acb->rwco.ret != NOT_DONE) {
        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
                                         blk_aio_complete_bh, acb);
    }

    return &acb->common;
}

static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_co_zone_mgmt(rwco->blk,
1913
                                 (BlockZoneOp)(uintptr_t)rwco->iobuf,
1914
                                 rwco->offset, acb->bytes);
1915
    blk_aio_complete(acb);
1916
}
1917

1918
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
1919
                              int64_t offset, int64_t len,
1920
                              BlockCompletionFunc *cb, void *opaque) {
1921
    BlkAioEmAIOCB *acb;
1922
    Coroutine *co;
1923
    IO_CODE();
1924

1925
    blk_inc_in_flight(blk);
1926
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1927
    acb->rwco = (BlkRwCo) {
1928
        .blk    = blk,
1929
        .offset = offset,
1930
        .iobuf  = (void *)(uintptr_t)op,
1931
        .ret    = NOT_DONE,
1932
    };
1933
    acb->bytes = len;
1934
    acb->has_returned = false;
1935

1936
    co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
1937
    aio_co_enter(qemu_get_current_aio_context(), co);
1938

1939
    acb->has_returned = true;
1940
    if (acb->rwco.ret != NOT_DONE) {
1941
        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1942
                                         blk_aio_complete_bh, acb);
1943
    }
1944

1945
    return &acb->common;
1946
}
1947

1948
static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
1949
{
1950
    BlkAioEmAIOCB *acb = opaque;
1951
    BlkRwCo *rwco = &acb->rwco;
1952

1953
    rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
1954
                                   rwco->iobuf, rwco->flags);
1955
    blk_aio_complete(acb);
1956
}
1957

1958
BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
1959
                                QEMUIOVector *qiov, BdrvRequestFlags flags,
1960
                                BlockCompletionFunc *cb, void *opaque) {
1961
    BlkAioEmAIOCB *acb;
1962
    Coroutine *co;
1963
    IO_CODE();
1964

1965
    blk_inc_in_flight(blk);
1966
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1967
    acb->rwco = (BlkRwCo) {
1968
        .blk    = blk,
1969
        .ret    = NOT_DONE,
1970
        .flags  = flags,
1971
        .iobuf  = qiov,
1972
    };
1973
    acb->bytes = (int64_t)(uintptr_t)offset;
1974
    acb->has_returned = false;
1975

1976
    co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
1977
    aio_co_enter(qemu_get_current_aio_context(), co);
1978
    acb->has_returned = true;
1979
    if (acb->rwco.ret != NOT_DONE) {
1980
        replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(),
1981
                                         blk_aio_complete_bh, acb);
1982
    }
1983

1984
    return &acb->common;
1985
}
1986

1987
/*
1988
 * Send a zone_report command.
1989
 * offset is a byte offset from the start of the device. No alignment
1990
 * required for offset.
1991
 * nr_zones represents IN maximum and OUT actual.
1992
 */
1993
int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
1994
                                    unsigned int *nr_zones,
1995
                                    BlockZoneDescriptor *zones)
1996
{
1997
    int ret;
1998
    IO_CODE();
1999

2000
    blk_inc_in_flight(blk); /* increase before waiting */
2001
    blk_wait_while_drained(blk);
2002
    GRAPH_RDLOCK_GUARD();
2003
    if (!blk_is_available(blk)) {
2004
        blk_dec_in_flight(blk);
2005
        return -ENOMEDIUM;
2006
    }
2007
    ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
2008
    blk_dec_in_flight(blk);
2009
    return ret;
2010
}
2011

2012
/*
2013
 * Send a zone_management command.
2014
 * op is the zone operation;
2015
 * offset is the byte offset from the start of the zoned device;
2016
 * len is the maximum number of bytes the command should operate on. It
2017
 * should be aligned with the device zone size.
2018
 */
2019
int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
2020
        int64_t offset, int64_t len)
2021
{
2022
    int ret;
2023
    IO_CODE();
2024

2025
    blk_inc_in_flight(blk);
2026
    blk_wait_while_drained(blk);
2027
    GRAPH_RDLOCK_GUARD();
2028

2029
    ret = blk_check_byte_request(blk, offset, len);
2030
    if (ret < 0) {
2031
        blk_dec_in_flight(blk);
2032
        return ret;
2033
    }
2034

2035
    ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
2036
    blk_dec_in_flight(blk);
2037
    return ret;
2038
}
2039

2040
/*
2041
 * Send a zone_append command.
2042
 */
2043
int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
2044
        QEMUIOVector *qiov, BdrvRequestFlags flags)
2045
{
2046
    int ret;
2047
    IO_CODE();
2048

2049
    blk_inc_in_flight(blk);
2050
    blk_wait_while_drained(blk);
2051
    GRAPH_RDLOCK_GUARD();
2052
    if (!blk_is_available(blk)) {
2053
        blk_dec_in_flight(blk);
2054
        return -ENOMEDIUM;
2055
    }
2056

2057
    ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
2058
    blk_dec_in_flight(blk);
2059
    return ret;
2060
}
2061

2062
void blk_drain(BlockBackend *blk)
2063
{
2064
    BlockDriverState *bs = blk_bs(blk);
2065
    GLOBAL_STATE_CODE();
2066

2067
    if (bs) {
2068
        bdrv_ref(bs);
2069
        bdrv_drained_begin(bs);
2070
    }
2071

2072
    /* We may have -ENOMEDIUM completions in flight */
2073
    AIO_WAIT_WHILE(blk_get_aio_context(blk),
2074
                   qatomic_read(&blk->in_flight) > 0);
2075

2076
    if (bs) {
2077
        bdrv_drained_end(bs);
2078
        bdrv_unref(bs);
2079
    }
2080
}
2081

2082
void blk_drain_all(void)
2083
{
2084
    BlockBackend *blk = NULL;
2085

2086
    GLOBAL_STATE_CODE();
2087

2088
    bdrv_drain_all_begin();
2089

2090
    while ((blk = blk_all_next(blk)) != NULL) {
2091
        /* We may have -ENOMEDIUM completions in flight */
2092
        AIO_WAIT_WHILE_UNLOCKED(NULL, qatomic_read(&blk->in_flight) > 0);
2093
    }
2094

2095
    bdrv_drain_all_end();
2096
}
2097

2098
void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
2099
                      BlockdevOnError on_write_error)
2100
{
2101
    GLOBAL_STATE_CODE();
2102
    blk->on_read_error = on_read_error;
2103
    blk->on_write_error = on_write_error;
2104
}
2105

2106
BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
2107
{
2108
    IO_CODE();
2109
    return is_read ? blk->on_read_error : blk->on_write_error;
2110
}
2111

2112
BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
2113
                                      int error)
2114
{
2115
    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
2116
    IO_CODE();
2117

2118
    switch (on_err) {
2119
    case BLOCKDEV_ON_ERROR_ENOSPC:
2120
        return (error == ENOSPC) ?
2121
               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
2122
    case BLOCKDEV_ON_ERROR_STOP:
2123
        return BLOCK_ERROR_ACTION_STOP;
2124
    case BLOCKDEV_ON_ERROR_REPORT:
2125
        return BLOCK_ERROR_ACTION_REPORT;
2126
    case BLOCKDEV_ON_ERROR_IGNORE:
2127
        return BLOCK_ERROR_ACTION_IGNORE;
2128
    case BLOCKDEV_ON_ERROR_AUTO:
2129
    default:
2130
        abort();
2131
    }
2132
}
2133

2134
static void send_qmp_error_event(BlockBackend *blk,
2135
                                 BlockErrorAction action,
2136
                                 bool is_read, int error)
2137
{
2138
    IoOperationType optype;
2139
    BlockDriverState *bs = blk_bs(blk);
2140

2141
    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
2142
    qapi_event_send_block_io_error(blk_name(blk),
2143
                                   bs ? bdrv_get_node_name(bs) : NULL, optype,
2144
                                   action, blk_iostatus_is_enabled(blk),
2145
                                   error == ENOSPC, strerror(error));
2146
}
2147

2148
/* This is done by device models because, while the block layer knows
2149
 * about the error, it does not know whether an operation comes from
2150
 * the device or the block layer (from a job, for example).
2151
 */
2152
void blk_error_action(BlockBackend *blk, BlockErrorAction action,
2153
                      bool is_read, int error)
2154
{
2155
    assert(error >= 0);
2156
    IO_CODE();
2157

2158
    if (action == BLOCK_ERROR_ACTION_STOP) {
2159
        /* First set the iostatus, so that "info block" returns an iostatus
2160
         * that matches the events raised so far (an additional error iostatus
2161
         * is fine, but not a lost one).
2162
         */
2163
        blk_iostatus_set_err(blk, error);
2164

2165
        /* Then raise the request to stop the VM and the event.
2166
         * qemu_system_vmstop_request_prepare has two effects.  First,
2167
         * it ensures that the STOP event always comes after the
2168
         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
2169
         * can observe the STOP event and do a "cont" before the STOP
2170
         * event is issued, the VM will not stop.  In this case, vm_start()
2171
         * also ensures that the STOP/RESUME pair of events is emitted.
2172
         */
2173
        qemu_system_vmstop_request_prepare();
2174
        send_qmp_error_event(blk, action, is_read, error);
2175
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
2176
    } else {
2177
        send_qmp_error_event(blk, action, is_read, error);
2178
    }
2179
}
2180

2181
/*
2182
 * Returns true if the BlockBackend can support taking write permissions
2183
 * (because its root node is not read-only).
2184
 */
2185
bool blk_supports_write_perm(BlockBackend *blk)
2186
{
2187
    BlockDriverState *bs = blk_bs(blk);
2188
    GLOBAL_STATE_CODE();
2189

2190
    if (bs) {
2191
        return !bdrv_is_read_only(bs);
2192
    } else {
2193
        return blk->root_state.open_flags & BDRV_O_RDWR;
2194
    }
2195
}
2196

2197
/*
2198
 * Returns true if the BlockBackend can be written to in its current
2199
 * configuration (i.e. if write permission have been requested)
2200
 */
2201
bool blk_is_writable(BlockBackend *blk)
2202
{
2203
    IO_CODE();
2204
    return blk->perm & BLK_PERM_WRITE;
2205
}
2206

2207
bool blk_is_sg(BlockBackend *blk)
2208
{
2209
    BlockDriverState *bs = blk_bs(blk);
2210
    GLOBAL_STATE_CODE();
2211

2212
    if (!bs) {
2213
        return false;
2214
    }
2215

2216
    return bdrv_is_sg(bs);
2217
}
2218

2219
bool blk_enable_write_cache(BlockBackend *blk)
2220
{
2221
    IO_CODE();
2222
    return blk->enable_write_cache;
2223
}
2224

2225
void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
2226
{
2227
    IO_CODE();
2228
    blk->enable_write_cache = wce;
2229
}
2230

2231
void blk_activate(BlockBackend *blk, Error **errp)
2232
{
2233
    BlockDriverState *bs = blk_bs(blk);
2234
    GLOBAL_STATE_CODE();
2235

2236
    if (!bs) {
2237
        error_setg(errp, "Device '%s' has no medium", blk->name);
2238
        return;
2239
    }
2240

2241
    /*
2242
     * Migration code can call this function in coroutine context, so leave
2243
     * coroutine context if necessary.
2244
     */
2245
    if (qemu_in_coroutine()) {
2246
        bdrv_co_activate(bs, errp);
2247
    } else {
2248
        GRAPH_RDLOCK_GUARD_MAINLOOP();
2249
        bdrv_activate(bs, errp);
2250
    }
2251
}
2252

2253
bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
2254
{
2255
    BlockDriverState *bs = blk_bs(blk);
2256
    IO_CODE();
2257
    assert_bdrv_graph_readable();
2258

2259
    return bs && bdrv_co_is_inserted(bs);
2260
}
2261

2262
bool coroutine_fn blk_co_is_available(BlockBackend *blk)
2263
{
2264
    IO_CODE();
2265
    return blk_co_is_inserted(blk) && !blk_dev_is_tray_open(blk);
2266
}
2267

2268
void coroutine_fn blk_co_lock_medium(BlockBackend *blk, bool locked)
2269
{
2270
    BlockDriverState *bs = blk_bs(blk);
2271
    IO_CODE();
2272
    GRAPH_RDLOCK_GUARD();
2273

2274
    if (bs) {
2275
        bdrv_co_lock_medium(bs, locked);
2276
    }
2277
}
2278

2279
void coroutine_fn blk_co_eject(BlockBackend *blk, bool eject_flag)
2280
{
2281
    BlockDriverState *bs = blk_bs(blk);
2282
    char *id;
2283
    IO_CODE();
2284
    GRAPH_RDLOCK_GUARD();
2285

2286
    if (bs) {
2287
        bdrv_co_eject(bs, eject_flag);
2288
    }
2289

2290
    /* Whether or not we ejected on the backend,
2291
     * the frontend experienced a tray event. */
2292
    id = blk_get_attached_dev_id(blk);
2293
    qapi_event_send_device_tray_moved(blk_name(blk), id,
2294
                                      eject_flag);
2295
    g_free(id);
2296
}
2297

2298
int blk_get_flags(BlockBackend *blk)
2299
{
2300
    BlockDriverState *bs = blk_bs(blk);
2301
    GLOBAL_STATE_CODE();
2302

2303
    if (bs) {
2304
        return bdrv_get_flags(bs);
2305
    } else {
2306
        return blk->root_state.open_flags;
2307
    }
2308
}
2309

2310
/* Returns the minimum request alignment, in bytes; guaranteed nonzero */
2311
uint32_t blk_get_request_alignment(BlockBackend *blk)
2312
{
2313
    BlockDriverState *bs = blk_bs(blk);
2314
    IO_CODE();
2315
    return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
2316
}
2317

2318
/* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */
2319
uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
2320
{
2321
    BlockDriverState *bs = blk_bs(blk);
2322
    uint64_t max = INT_MAX;
2323
    IO_CODE();
2324

2325
    if (bs) {
2326
        max = MIN_NON_ZERO(max, bs->bl.max_hw_transfer);
2327
        max = MIN_NON_ZERO(max, bs->bl.max_transfer);
2328
    }
2329
    return ROUND_DOWN(max, blk_get_request_alignment(blk));
2330
}
2331

2332
/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
2333
uint32_t blk_get_max_transfer(BlockBackend *blk)
2334
{
2335
    BlockDriverState *bs = blk_bs(blk);
2336
    uint32_t max = INT_MAX;
2337
    IO_CODE();
2338

2339
    if (bs) {
2340
        max = MIN_NON_ZERO(max, bs->bl.max_transfer);
2341
    }
2342
    return ROUND_DOWN(max, blk_get_request_alignment(blk));
2343
}
2344

2345
int blk_get_max_hw_iov(BlockBackend *blk)
2346
{
2347
    IO_CODE();
2348
    return MIN_NON_ZERO(blk->root->bs->bl.max_hw_iov,
2349
                        blk->root->bs->bl.max_iov);
2350
}
2351

2352
int blk_get_max_iov(BlockBackend *blk)
2353
{
2354
    IO_CODE();
2355
    return blk->root->bs->bl.max_iov;
2356
}
2357

2358
void *blk_try_blockalign(BlockBackend *blk, size_t size)
2359
{
2360
    IO_CODE();
2361
    return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
2362
}
2363

2364
void *blk_blockalign(BlockBackend *blk, size_t size)
2365
{
2366
    IO_CODE();
2367
    return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
2368
}
2369

2370
bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
2371
{
2372
    BlockDriverState *bs = blk_bs(blk);
2373
    GLOBAL_STATE_CODE();
2374
    GRAPH_RDLOCK_GUARD_MAINLOOP();
2375

2376
    if (!bs) {
2377
        return false;
2378
    }
2379

2380
    return bdrv_op_is_blocked(bs, op, errp);
2381
}
2382

2383
void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
2384
{
2385
    BlockDriverState *bs = blk_bs(blk);
2386
    GLOBAL_STATE_CODE();
2387

2388
    if (bs) {
2389
        bdrv_op_unblock(bs, op, reason);
2390
    }
2391
}
2392

2393
void blk_op_block_all(BlockBackend *blk, Error *reason)
2394
{
2395
    BlockDriverState *bs = blk_bs(blk);
2396
    GLOBAL_STATE_CODE();
2397

2398
    if (bs) {
2399
        bdrv_op_block_all(bs, reason);
2400
    }
2401
}
2402

2403
void blk_op_unblock_all(BlockBackend *blk, Error *reason)
2404
{
2405
    BlockDriverState *bs = blk_bs(blk);
2406
    GLOBAL_STATE_CODE();
2407

2408
    if (bs) {
2409
        bdrv_op_unblock_all(bs, reason);
2410
    }
2411
}
2412

2413
/**
2414
 * Return BB's current AioContext.  Note that this context may change
2415
 * concurrently at any time, with one exception: If the BB has a root node
2416
 * attached, its context will only change through bdrv_try_change_aio_context(),
2417
 * which creates a drained section.  Therefore, incrementing such a BB's
2418
 * in-flight counter will prevent its context from changing.
2419
 */
2420
AioContext *blk_get_aio_context(BlockBackend *blk)
2421
{
2422
    IO_CODE();
2423

2424
    if (!blk) {
2425
        return qemu_get_aio_context();
2426
    }
2427

2428
    return qatomic_read(&blk->ctx);
2429
}
2430

2431
int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2432
                        Error **errp)
2433
{
2434
    bool old_allow_change;
2435
    BlockDriverState *bs = blk_bs(blk);
2436
    int ret;
2437

2438
    GLOBAL_STATE_CODE();
2439

2440
    if (!bs) {
2441
        qatomic_set(&blk->ctx, new_context);
2442
        return 0;
2443
    }
2444

2445
    bdrv_ref(bs);
2446

2447
    old_allow_change = blk->allow_aio_context_change;
2448
    blk->allow_aio_context_change = true;
2449

2450
    ret = bdrv_try_change_aio_context(bs, new_context, NULL, errp);
2451

2452
    blk->allow_aio_context_change = old_allow_change;
2453

2454
    bdrv_unref(bs);
2455
    return ret;
2456
}
2457

2458
typedef struct BdrvStateBlkRootContext {
2459
    AioContext *new_ctx;
2460
    BlockBackend *blk;
2461
} BdrvStateBlkRootContext;
2462

2463
static void blk_root_set_aio_ctx_commit(void *opaque)
2464
{
2465
    BdrvStateBlkRootContext *s = opaque;
2466
    BlockBackend *blk = s->blk;
2467
    AioContext *new_context = s->new_ctx;
2468
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2469

2470
    qatomic_set(&blk->ctx, new_context);
2471
    if (tgm->throttle_state) {
2472
        throttle_group_detach_aio_context(tgm);
2473
        throttle_group_attach_aio_context(tgm, new_context);
2474
    }
2475
}
2476

2477
static TransactionActionDrv set_blk_root_context = {
2478
    .commit = blk_root_set_aio_ctx_commit,
2479
    .clean = g_free,
2480
};
2481

2482
static bool blk_root_change_aio_ctx(BdrvChild *child, AioContext *ctx,
2483
                                    GHashTable *visited, Transaction *tran,
2484
                                    Error **errp)
2485
{
2486
    BlockBackend *blk = child->opaque;
2487
    BdrvStateBlkRootContext *s;
2488

2489
    if (!blk->allow_aio_context_change) {
2490
        /*
2491
         * Manually created BlockBackends (those with a name) that are not
2492
         * attached to anything can change their AioContext without updating
2493
         * their user; return an error for others.
2494
         */
2495
        if (!blk->name || blk->dev) {
2496
            /* TODO Add BB name/QOM path */
2497
            error_setg(errp, "Cannot change iothread of active block backend");
2498
            return false;
2499
        }
2500
    }
2501

2502
    s = g_new(BdrvStateBlkRootContext, 1);
2503
    *s = (BdrvStateBlkRootContext) {
2504
        .new_ctx = ctx,
2505
        .blk = blk,
2506
    };
2507

2508
    tran_add(tran, &set_blk_root_context, s);
2509
    return true;
2510
}
2511

2512
void blk_add_aio_context_notifier(BlockBackend *blk,
2513
        void (*attached_aio_context)(AioContext *new_context, void *opaque),
2514
        void (*detach_aio_context)(void *opaque), void *opaque)
2515
{
2516
    BlockBackendAioNotifier *notifier;
2517
    BlockDriverState *bs = blk_bs(blk);
2518
    GLOBAL_STATE_CODE();
2519

2520
    notifier = g_new(BlockBackendAioNotifier, 1);
2521
    notifier->attached_aio_context = attached_aio_context;
2522
    notifier->detach_aio_context = detach_aio_context;
2523
    notifier->opaque = opaque;
2524
    QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2525

2526
    if (bs) {
2527
        bdrv_add_aio_context_notifier(bs, attached_aio_context,
2528
                                      detach_aio_context, opaque);
2529
    }
2530
}
2531

2532
void blk_remove_aio_context_notifier(BlockBackend *blk,
2533
                                     void (*attached_aio_context)(AioContext *,
2534
                                                                  void *),
2535
                                     void (*detach_aio_context)(void *),
2536
                                     void *opaque)
2537
{
2538
    BlockBackendAioNotifier *notifier;
2539
    BlockDriverState *bs = blk_bs(blk);
2540

2541
    GLOBAL_STATE_CODE();
2542

2543
    if (bs) {
2544
        bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2545
                                         detach_aio_context, opaque);
2546
    }
2547

2548
    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2549
        if (notifier->attached_aio_context == attached_aio_context &&
2550
            notifier->detach_aio_context == detach_aio_context &&
2551
            notifier->opaque == opaque) {
2552
            QLIST_REMOVE(notifier, list);
2553
            g_free(notifier);
2554
            return;
2555
        }
2556
    }
2557

2558
    abort();
2559
}
2560

2561
void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2562
{
2563
    GLOBAL_STATE_CODE();
2564
    notifier_list_add(&blk->remove_bs_notifiers, notify);
2565
}
2566

2567
void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2568
{
2569
    GLOBAL_STATE_CODE();
2570
    notifier_list_add(&blk->insert_bs_notifiers, notify);
2571
}
2572

2573
BlockAcctStats *blk_get_stats(BlockBackend *blk)
2574
{
2575
    IO_CODE();
2576
    return &blk->stats;
2577
}
2578

2579
void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2580
                  BlockCompletionFunc *cb, void *opaque)
2581
{
2582
    IO_CODE();
2583
    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2584
}
2585

2586
int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2587
                                      int64_t bytes, BdrvRequestFlags flags)
2588
{
2589
    IO_OR_GS_CODE();
2590
    return blk_co_pwritev(blk, offset, bytes, NULL,
2591
                          flags | BDRV_REQ_ZERO_WRITE);
2592
}
2593

2594
int coroutine_fn blk_co_pwrite_compressed(BlockBackend *blk, int64_t offset,
2595
                                          int64_t bytes, const void *buf)
2596
{
2597
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
2598
    IO_OR_GS_CODE();
2599
    return blk_co_pwritev_part(blk, offset, bytes, &qiov, 0,
2600
                               BDRV_REQ_WRITE_COMPRESSED);
2601
}
2602

2603
int coroutine_fn blk_co_truncate(BlockBackend *blk, int64_t offset, bool exact,
2604
                                 PreallocMode prealloc, BdrvRequestFlags flags,
2605
                                 Error **errp)
2606
{
2607
    IO_OR_GS_CODE();
2608
    GRAPH_RDLOCK_GUARD();
2609
    if (!blk_co_is_available(blk)) {
2610
        error_setg(errp, "No medium inserted");
2611
        return -ENOMEDIUM;
2612
    }
2613

2614
    return bdrv_co_truncate(blk->root, offset, exact, prealloc, flags, errp);
2615
}
2616

2617
int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2618
                     int64_t pos, int size)
2619
{
2620
    int ret;
2621
    GLOBAL_STATE_CODE();
2622

2623
    if (!blk_is_available(blk)) {
2624
        return -ENOMEDIUM;
2625
    }
2626

2627
    ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2628
    if (ret < 0) {
2629
        return ret;
2630
    }
2631

2632
    if (ret == size && !blk->enable_write_cache) {
2633
        ret = bdrv_flush(blk_bs(blk));
2634
    }
2635

2636
    return ret < 0 ? ret : size;
2637
}
2638

2639
int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2640
{
2641
    GLOBAL_STATE_CODE();
2642
    if (!blk_is_available(blk)) {
2643
        return -ENOMEDIUM;
2644
    }
2645

2646
    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2647
}
2648

2649
int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2650
{
2651
    GLOBAL_STATE_CODE();
2652
    GRAPH_RDLOCK_GUARD_MAINLOOP();
2653

2654
    if (!blk_is_available(blk)) {
2655
        return -ENOMEDIUM;
2656
    }
2657

2658
    return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2659
}
2660

2661
int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2662
{
2663
    GLOBAL_STATE_CODE();
2664
    if (!blk_is_available(blk)) {
2665
        return -ENOMEDIUM;
2666
    }
2667

2668
    return bdrv_probe_geometry(blk_bs(blk), geo);
2669
}
2670

2671
/*
2672
 * Updates the BlockBackendRootState object with data from the currently
2673
 * attached BlockDriverState.
2674
 */
2675
void blk_update_root_state(BlockBackend *blk)
2676
{
2677
    GLOBAL_STATE_CODE();
2678
    assert(blk->root);
2679

2680
    blk->root_state.open_flags    = blk->root->bs->open_flags;
2681
    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2682
}
2683

2684
/*
2685
 * Returns the detect-zeroes setting to be used for bdrv_open() of a
2686
 * BlockDriverState which is supposed to inherit the root state.
2687
 */
2688
bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2689
{
2690
    GLOBAL_STATE_CODE();
2691
    return blk->root_state.detect_zeroes;
2692
}
2693

2694
/*
2695
 * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2696
 * supposed to inherit the root state.
2697
 */
2698
int blk_get_open_flags_from_root_state(BlockBackend *blk)
2699
{
2700
    GLOBAL_STATE_CODE();
2701
    return blk->root_state.open_flags;
2702
}
2703

2704
BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2705
{
2706
    GLOBAL_STATE_CODE();
2707
    return &blk->root_state;
2708
}
2709

2710
int blk_commit_all(void)
2711
{
2712
    BlockBackend *blk = NULL;
2713
    GLOBAL_STATE_CODE();
2714
    GRAPH_RDLOCK_GUARD_MAINLOOP();
2715

2716
    while ((blk = blk_all_next(blk)) != NULL) {
2717
        BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
2718

2719
        if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
2720
            int ret;
2721

2722
            ret = bdrv_commit(unfiltered_bs);
2723
            if (ret < 0) {
2724
                return ret;
2725
            }
2726
        }
2727
    }
2728
    return 0;
2729
}
2730

2731

2732
/* throttling disk I/O limits */
2733
void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2734
{
2735
    GLOBAL_STATE_CODE();
2736
    throttle_group_config(&blk->public.throttle_group_member, cfg);
2737
}
2738

2739
void blk_io_limits_disable(BlockBackend *blk)
2740
{
2741
    BlockDriverState *bs = blk_bs(blk);
2742
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2743
    assert(tgm->throttle_state);
2744
    GLOBAL_STATE_CODE();
2745
    if (bs) {
2746
        bdrv_ref(bs);
2747
        bdrv_drained_begin(bs);
2748
    }
2749
    throttle_group_unregister_tgm(tgm);
2750
    if (bs) {
2751
        bdrv_drained_end(bs);
2752
        bdrv_unref(bs);
2753
    }
2754
}
2755

2756
/* should be called before blk_set_io_limits if a limit is set */
2757
void blk_io_limits_enable(BlockBackend *blk, const char *group)
2758
{
2759
    assert(!blk->public.throttle_group_member.throttle_state);
2760
    GLOBAL_STATE_CODE();
2761
    throttle_group_register_tgm(&blk->public.throttle_group_member,
2762
                                group, blk_get_aio_context(blk));
2763
}
2764

2765
void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2766
{
2767
    GLOBAL_STATE_CODE();
2768
    /* this BB is not part of any group */
2769
    if (!blk->public.throttle_group_member.throttle_state) {
2770
        return;
2771
    }
2772

2773
    /* this BB is a part of the same group than the one we want */
2774
    if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2775
                group)) {
2776
        return;
2777
    }
2778

2779
    /* need to change the group this bs belong to */
2780
    blk_io_limits_disable(blk);
2781
    blk_io_limits_enable(blk, group);
2782
}
2783

2784
static void blk_root_drained_begin(BdrvChild *child)
2785
{
2786
    BlockBackend *blk = child->opaque;
2787
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2788

2789
    if (qatomic_fetch_inc(&blk->quiesce_counter) == 0) {
2790
        if (blk->dev_ops && blk->dev_ops->drained_begin) {
2791
            blk->dev_ops->drained_begin(blk->dev_opaque);
2792
        }
2793
    }
2794

2795
    /* Note that blk->root may not be accessible here yet if we are just
2796
     * attaching to a BlockDriverState that is drained. Use child instead. */
2797

2798
    if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
2799
        throttle_group_restart_tgm(tgm);
2800
    }
2801
}
2802

2803
static bool blk_root_drained_poll(BdrvChild *child)
2804
{
2805
    BlockBackend *blk = child->opaque;
2806
    bool busy = false;
2807
    assert(qatomic_read(&blk->quiesce_counter));
2808

2809
    if (blk->dev_ops && blk->dev_ops->drained_poll) {
2810
        busy = blk->dev_ops->drained_poll(blk->dev_opaque);
2811
    }
2812
    return busy || !!blk->in_flight;
2813
}
2814

2815
static void blk_root_drained_end(BdrvChild *child)
2816
{
2817
    BlockBackend *blk = child->opaque;
2818
    assert(qatomic_read(&blk->quiesce_counter));
2819

2820
    assert(blk->public.throttle_group_member.io_limits_disabled);
2821
    qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2822

2823
    if (qatomic_fetch_dec(&blk->quiesce_counter) == 1) {
2824
        if (blk->dev_ops && blk->dev_ops->drained_end) {
2825
            blk->dev_ops->drained_end(blk->dev_opaque);
2826
        }
2827
        qemu_mutex_lock(&blk->queued_requests_lock);
2828
        while (qemu_co_enter_next(&blk->queued_requests,
2829
                                  &blk->queued_requests_lock)) {
2830
            /* Resume all queued requests */
2831
        }
2832
        qemu_mutex_unlock(&blk->queued_requests_lock);
2833
    }
2834
}
2835

2836
bool blk_register_buf(BlockBackend *blk, void *host, size_t size, Error **errp)
2837
{
2838
    BlockDriverState *bs = blk_bs(blk);
2839

2840
    GLOBAL_STATE_CODE();
2841

2842
    if (bs) {
2843
        return bdrv_register_buf(bs, host, size, errp);
2844
    }
2845
    return true;
2846
}
2847

2848
void blk_unregister_buf(BlockBackend *blk, void *host, size_t size)
2849
{
2850
    BlockDriverState *bs = blk_bs(blk);
2851

2852
    GLOBAL_STATE_CODE();
2853

2854
    if (bs) {
2855
        bdrv_unregister_buf(bs, host, size);
2856
    }
2857
}
2858

2859
int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2860
                                   BlockBackend *blk_out, int64_t off_out,
2861
                                   int64_t bytes, BdrvRequestFlags read_flags,
2862
                                   BdrvRequestFlags write_flags)
2863
{
2864
    int r;
2865
    IO_CODE();
2866
    GRAPH_RDLOCK_GUARD();
2867

2868
    r = blk_check_byte_request(blk_in, off_in, bytes);
2869
    if (r) {
2870
        return r;
2871
    }
2872
    r = blk_check_byte_request(blk_out, off_out, bytes);
2873
    if (r) {
2874
        return r;
2875
    }
2876

2877
    return bdrv_co_copy_range(blk_in->root, off_in,
2878
                              blk_out->root, off_out,
2879
                              bytes, read_flags, write_flags);
2880
}
2881

2882
const BdrvChild *blk_root(BlockBackend *blk)
2883
{
2884
    GLOBAL_STATE_CODE();
2885
    return blk->root;
2886
}
2887

2888
int blk_make_empty(BlockBackend *blk, Error **errp)
2889
{
2890
    GLOBAL_STATE_CODE();
2891
    GRAPH_RDLOCK_GUARD_MAINLOOP();
2892

2893
    if (!blk_is_available(blk)) {
2894
        error_setg(errp, "No medium inserted");
2895
        return -ENOMEDIUM;
2896
    }
2897

2898
    return bdrv_make_empty(blk->root, errp);
2899
}
2900

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.