15
#include "qemu/osdep.h"
16
#include "qemu/module.h"
17
#include "qemu/option.h"
19
#include "block/blockjob.h"
20
#include "block/block_int.h"
21
#include "block/block_backup.h"
22
#include "sysemu/block-backend.h"
23
#include "qapi/error.h"
24
#include "qapi/qmp/qdict.h"
25
#include "block/replication.h"
28
/*
 * Replication stage constants. The driver state's 'stage' field is assigned
 * and compared against these values throughout this file.
 */
/* replication_start() reports an error unless the stage is this one */
BLOCK_REPLICATION_NONE,
29
/* set at the end of replication_start() */
BLOCK_REPLICATION_RUNNING,
30
/* set by replication_stop() right before the commit job is started */
BLOCK_REPLICATION_FAILOVER,
31
/* set by replication_done(), the commit job completion callback */
BLOCK_REPLICATION_FAILOVER_FAILED,
32
/* set by replication_stop() and by replication_done() */
BLOCK_REPLICATION_DONE,
35
/*
 * Per-node state of the replication driver (used as bs->opaque).
 * NOTE(review): other code in this file also reads fields such as mode,
 * top_id, rs, blocker, backup_job and commit_job that are not visible in
 * this excerpt of the struct.
 */
typedef struct BDRVReplicationState {
37
/* current stage, one of the BLOCK_REPLICATION_* constants */
ReplicationStage stage;
39
/* child attached in replication_start(), released in replication_done() */
BdrvChild *hidden_disk;
40
/* child attached in replication_start(), released in replication_done() */
BdrvChild *secondary_disk;
45
/* read-only state of the hidden disk as seen by reopen_backing_file() */
bool orig_hidden_read_only;
46
/* read-only state of the secondary disk as seen by reopen_backing_file() */
bool orig_secondary_read_only;
48
} BDRVReplicationState;
50
/*
 * Forward declarations of the four callbacks that are installed in
 * replication_ops (start, checkpoint, get_error, stop).
 */
static void replication_start(ReplicationState *rs, ReplicationMode mode,
52
static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
53
static void replication_get_error(ReplicationState *rs, Error **errp);
54
static void replication_stop(ReplicationState *rs, bool failover,
57
/*
 * Names of the runtime options of this driver; both are declared in
 * replication_runtime_opts and looked up in replication_open().
 */
#define REPLICATION_MODE "mode"
58
#define REPLICATION_TOP_ID "top-id"
59
/*
 * Runtime option list named "replication". It declares two string options,
 * REPLICATION_MODE and REPLICATION_TOP_ID, which replication_open() absorbs
 * from the options dictionary.
 */
static QemuOptsList replication_runtime_opts = {
60
.name = "replication",
61
.head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
64
/* "mode" option, string-typed */
.name = REPLICATION_MODE,
65
.type = QEMU_OPT_STRING,
68
/* "top-id" option, string-typed */
.name = REPLICATION_TOP_ID,
69
.type = QEMU_OPT_STRING,
75
/*
 * Callback table passed to replication_new() in replication_open(); it maps
 * the start/checkpoint/get_error/stop operations to this driver's handlers.
 */
static ReplicationOps replication_ops = {
76
.start = replication_start,
77
.checkpoint = replication_do_checkpoint,
78
.get_error = replication_get_error,
79
.stop = replication_stop,
82
/*
 * Open callback of the replication driver (installed as .bdrv_open).
 *
 * Opens the "file" child, absorbs the runtime options from @options, and
 * configures the driver state from the "mode" option ("primary" or
 * "secondary"). As the error messages below show, "mode" is mandatory,
 * "top-id" is rejected on the primary side and required on the secondary
 * side. Finally a ReplicationState is created with replication_ops.
 *
 * NOTE(review): the conditions guarding the error_setg() calls and the
 * cleanup/return paths are not visible in this excerpt.
 */
static int replication_open(BlockDriverState *bs, QDict *options,
83
int flags, Error **errp)
86
BDRVReplicationState *s = bs->opaque;
87
QemuOpts *opts = NULL;
91
ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
97
/* creating the option set is not expected to fail, hence error_abort */
opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
98
if (!qemu_opts_absorb_qdict(opts, options, errp)) {
102
mode = qemu_opt_get(opts, REPLICATION_MODE);
104
error_setg(errp, "Missing the option mode");
108
if (!strcmp(mode, "primary")) {
109
s->mode = REPLICATION_MODE_PRIMARY;
110
top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
113
"The primary side does not support option top-id");
116
} else if (!strcmp(mode, "secondary")) {
117
s->mode = REPLICATION_MODE_SECONDARY;
118
top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
119
/* keep a private copy; the string returned by qemu_opt_get() is not stored */
s->top_id = g_strdup(top_id);
121
error_setg(errp, "Missing the option top-id");
126
"The option mode's value should be primary or secondary");
130
s->rs = replication_new(bs, &replication_ops);
139
/*
 * Close callback (installed as .bdrv_close).
 *
 * If replication is still running it is stopped without failover and with
 * errors ignored (NULL errp). If a failover is in progress, the commit job
 * is cancelled synchronously. The ReplicationState created in
 * replication_open() is removed at the end.
 *
 * NOTE(review): the body of the secondary-mode branch is not visible here.
 */
static void replication_close(BlockDriverState *bs)
141
BDRVReplicationState *s = bs->opaque;
145
if (s->stage == BLOCK_REPLICATION_RUNNING) {
146
replication_stop(s->rs, false, NULL);
148
if (s->stage == BLOCK_REPLICATION_FAILOVER) {
149
commit_job = &s->commit_job->job;
150
/* the job must belong to the AioContext we are currently running in */
assert(commit_job->aio_context == qemu_get_current_aio_context());
151
job_cancel_sync(commit_job, false);
154
if (s->mode == REPLICATION_MODE_SECONDARY) {
158
replication_remove(s->rs);
161
/*
 * Child permission callback (installed as .bdrv_child_perm).
 *
 * For a child whose role includes BDRV_CHILD_PRIMARY, consistent-read
 * permission is requested, and write permission is added when the node is
 * opened read-write and is not inactive. The shared permission set is built
 * from BLK_PERM_CONSISTENT_READ and BLK_PERM_WRITE_UNCHANGED.
 *
 * NOTE(review): part of the shared-permission expression and the handling
 * of other child roles are not visible in this excerpt.
 */
static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
163
BlockReopenQueue *reopen_queue,
164
uint64_t perm, uint64_t shared,
165
uint64_t *nperm, uint64_t *nshared)
167
if (role & BDRV_CHILD_PRIMARY) {
168
*nperm = BLK_PERM_CONSISTENT_READ;
173
/* true only when RDWR is set and INACTIVE is clear */
if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
174
*nperm |= BLK_PERM_WRITE;
176
*nshared = BLK_PERM_CONSISTENT_READ
178
| BLK_PERM_WRITE_UNCHANGED;
182
/*
 * Length callback (installed as .bdrv_co_getlength): returns whatever
 * bdrv_co_getlength() reports for the node behind the "file" child.
 */
static int64_t coroutine_fn GRAPH_RDLOCK
183
replication_co_getlength(BlockDriverState *bs)
185
return bdrv_co_getlength(bs->file->bs);
188
/*
 * Map the current replication stage to a status value for the I/O paths
 * (called by replication_co_readv() and replication_co_writev()).
 *
 * In the visible return statements a primary-mode node always yields -EIO,
 * while a non-primary node yields 0 or 1 depending on the stage.
 *
 * NOTE(review): the switch header and the returns for the NONE and RUNNING
 * cases are not visible here. Which case the last return belongs to should
 * be confirmed against the full source.
 */
static int replication_get_io_status(BDRVReplicationState *s)
191
case BLOCK_REPLICATION_NONE:
193
case BLOCK_REPLICATION_RUNNING:
195
case BLOCK_REPLICATION_FAILOVER:
196
return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
197
case BLOCK_REPLICATION_FAILOVER_FAILED:
198
return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
199
case BLOCK_REPLICATION_DONE:
204
return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
210
/*
 * Post-process the result @ret of a child I/O request before it is returned
 * by replication_co_readv() / replication_co_writev().
 *
 * NOTE(review): only the secondary-mode check is visible in this excerpt;
 * what is done with @ret in either mode cannot be stated from here.
 */
static int replication_return_value(BDRVReplicationState *s, int ret)
212
if (s->mode == REPLICATION_MODE_SECONDARY) {
224
/*
 * Sector-based read callback (installed as .bdrv_co_readv).
 *
 * Queries replication_get_io_status(), reads from the "file" child after
 * converting the sector position and count to bytes, and passes the result
 * through replication_return_value().
 *
 * NOTE(review): the body of the primary-mode branch and the handling of the
 * status value are not visible in this excerpt.
 */
static int coroutine_fn GRAPH_RDLOCK
225
replication_co_readv(BlockDriverState *bs, int64_t sector_num,
226
int remaining_sectors, QEMUIOVector *qiov)
228
BDRVReplicationState *s = bs->opaque;
231
if (s->mode == REPLICATION_MODE_PRIMARY) {
236
ret = replication_get_io_status(s);
241
/* the byte-based API takes offset and length in bytes, not sectors */
ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
242
remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
244
return replication_return_value(s, ret);
247
/*
 * Sector-based write callback (installed as .bdrv_co_writev).
 *
 * Two write paths are visible:
 *  - a direct write of the whole request to 'top' (the "file" child), whose
 *    result is returned through replication_return_value();
 *  - a chunked loop that asks bdrv_co_is_allocated_above() about the range
 *    between 'top' and 'base' (the secondary disk) and writes each chunk to
 *    'top' when the query result is non-zero, otherwise to 'base'.
 *
 * NOTE(review): the conditions that select between the two paths, the error
 * handling inside the loop and the final return are not visible here.
 */
static int coroutine_fn GRAPH_RDLOCK
248
replication_co_writev(BlockDriverState *bs, int64_t sector_num,
249
int remaining_sectors, QEMUIOVector *qiov, int flags)
251
BDRVReplicationState *s = bs->opaque;
252
/* scratch vector describing the slice of qiov written in one iteration */
QEMUIOVector hd_qiov;
253
/* offset into qiov of the next chunk to write */
uint64_t bytes_done = 0;
254
BdrvChild *top = bs->file;
255
BdrvChild *base = s->secondary_disk;
260
ret = replication_get_io_status(s);
266
ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
267
remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
268
return replication_return_value(s, ret);
275
qemu_iovec_init(&hd_qiov, qiov->niov);
276
while (remaining_sectors > 0) {
279
ret = bdrv_co_is_allocated_above(top->bs, base->bs, false,
280
sector_num * BDRV_SECTOR_SIZE,
281
remaining_sectors * BDRV_SECTOR_SIZE,
287
/* the chunk length must be a whole number of sectors */
assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
288
n = count >> BDRV_SECTOR_BITS;
289
/* rebuild hd_qiov as the 'count' bytes of qiov starting at bytes_done */
qemu_iovec_reset(&hd_qiov);
290
qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
292
/* non-zero allocation result: write to top; zero: write to base */
target = ret ? top : base;
293
ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
294
n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
299
remaining_sectors -= n;
305
qemu_iovec_destroy(&hd_qiov);
310
/*
 * Perform a checkpoint on the secondary side.
 *
 * Fails if there is no backup job. Otherwise it runs backup_do_checkpoint()
 * on the job and then empties the active disk (the "file" child) and the
 * hidden disk with bdrv_make_empty(). A disk whose node has no driver is
 * reported as ejected.
 *
 * Errors are reported through @errp.
 *
 * NOTE(review): the early returns after each error and the checks of 'ret'
 * are not visible in this excerpt.
 */
static void GRAPH_UNLOCKED
311
secondary_do_checkpoint(BlockDriverState *bs, Error **errp)
313
BDRVReplicationState *s = bs->opaque;
314
BdrvChild *active_disk;
315
Error *local_err = NULL;
318
GRAPH_RDLOCK_GUARD_MAINLOOP();
320
if (!s->backup_job) {
321
error_setg(errp, "Backup job was cancelled unexpectedly");
325
backup_do_checkpoint(s->backup_job, &local_err);
327
error_propagate(errp, local_err);
331
active_disk = bs->file;
332
if (!active_disk->bs->drv) {
333
error_setg(errp, "Active disk %s is ejected",
334
active_disk->bs->node_name);
338
ret = bdrv_make_empty(active_disk, errp);
343
if (!s->hidden_disk->bs->drv) {
344
error_setg(errp, "Hidden disk %s is ejected",
345
s->hidden_disk->bs->node_name);
349
ret = bdrv_make_empty(s->hidden_disk, errp);
360
/*
 * Reopen the hidden disk and the secondary disk with a changed read-only
 * setting.
 *
 * The hidden disk is the backing child of the node behind the "file" child,
 * and the secondary disk is the backing child of the hidden disk. For each
 * of them whose recorded original state is read-only, a reopen is queued
 * with BDRV_OPT_READ_ONLY set to !@writable. The queue is then applied with
 * bdrv_reopen_multiple(), which reports errors through @errp.
 *
 * NOTE(review): whether the orig_*_read_only fields are recorded on every
 * call or only for some values of @writable is not visible here.
 */
static void reopen_backing_file(BlockDriverState *bs, bool writable,
363
BDRVReplicationState *s = bs->opaque;
364
BdrvChild *hidden_disk, *secondary_disk;
365
BlockReopenQueue *reopen_queue = NULL;
368
GRAPH_RDLOCK_GUARD_MAINLOOP();
374
hidden_disk = bs->file->bs->backing;
375
secondary_disk = hidden_disk->bs->backing;
378
s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
379
s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
382
/* nodes that were not read-only originally are not queued for reopen */
if (s->orig_hidden_read_only) {
383
QDict *opts = qdict_new();
384
qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
385
reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
389
if (s->orig_secondary_read_only) {
390
QDict *opts = qdict_new();
391
qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
392
reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
397
bdrv_reopen_multiple(reopen_queue, errp);
401
/*
 * Undo the setup done for the internal backup job: forget the job pointer,
 * lift the operation blocker from the top node (looked up by s->top_id),
 * free the blocker, and reopen the backing files as non-writable. Errors of
 * that reopen are ignored (NULL errp).
 *
 * NOTE(review): any validation of 'top_bs' between the lookup and the
 * unblock call is not visible in this excerpt.
 */
static void backup_job_cleanup(BlockDriverState *bs)
403
BDRVReplicationState *s = bs->opaque;
404
BlockDriverState *top_bs;
406
s->backup_job = NULL;
408
top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
412
bdrv_op_unblock_all(top_bs, s->blocker);
413
error_free(s->blocker);
414
reopen_backing_file(bs, false, NULL);
417
/*
 * Completion callback of the internal backup job; replication_start()
 * passes it to backup_job_create() with the BlockDriverState as @opaque.
 *
 * It inspects whether the stage is BLOCK_REPLICATION_FAILOVER and calls
 * backup_job_cleanup().
 *
 * NOTE(review): the body of the stage check is not visible here, so what
 * happens when the job ends outside of a failover cannot be stated.
 */
static void backup_job_completed(void *opaque, int ret)
419
BlockDriverState *bs = opaque;
420
BDRVReplicationState *s = bs->opaque;
422
if (s->stage != BLOCK_REPLICATION_FAILOVER) {
427
backup_job_cleanup(bs);
430
/*
 * Recursively search the children of @top_bs for @bs. The visible condition
 * matches when a child's node is @bs itself or when the recursive call on
 * that child's node succeeds.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
static bool GRAPH_RDLOCK
431
check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
441
QLIST_FOREACH(child, &top_bs->children, next) {
442
if (child->bs == bs || check_top_bs(child->bs, bs)) {
450
/*
 * Start callback (installed as replication_ops.start).
 *
 * The stage must be BLOCK_REPLICATION_NONE and @mode must equal the mode
 * the node was opened with; otherwise an error is set.
 *
 * The secondary-mode steps visible here are:
 *  1. Validate the chain active disk ("file" child) -> hidden disk ->
 *     secondary disk, and require a block backend on the secondary disk.
 *  2. Require that all three disks report the same, non-negative length.
 *  3. Require that the active and hidden disk drivers implement
 *     bdrv_make_empty.
 *  4. Reopen the backing files as writable.
 *  5. Attach the hidden and secondary disks as children of this node.
 *  6. Look up the top node by s->top_id, validate it, and block operations
 *     on it (dataplane stays allowed).
 *  7. Create and start the internal backup job from the secondary disk to
 *     the hidden disk.
 *
 * Finally the stage becomes BLOCK_REPLICATION_RUNNING and, on a secondary
 * node, an initial checkpoint is taken.
 *
 * NOTE(review): the switch header, the assignment of 's', the write-lock
 * acquisitions and the early returns are not visible in this excerpt.
 */
static void replication_start(ReplicationState *rs, ReplicationMode mode,
453
BlockDriverState *bs = rs->opaque;
454
BDRVReplicationState *s;
455
BlockDriverState *top_bs;
456
BdrvChild *active_disk, *hidden_disk, *secondary_disk;
457
int64_t active_length, hidden_length, disk_length;
458
Error *local_err = NULL;
459
/* performance settings for the backup job created further down */
BackupPerf perf = { .use_copy_range = true, .max_workers = 1 };
465
if (s->stage == BLOCK_REPLICATION_DONE ||
466
s->stage == BLOCK_REPLICATION_FAILOVER) {
475
if (s->stage != BLOCK_REPLICATION_NONE) {
476
error_setg(errp, "Block replication is running or done");
480
if (s->mode != mode) {
481
error_setg(errp, "The parameter mode's value is invalid, needs %d,"
482
" but got %d", s->mode, mode);
487
case REPLICATION_MODE_PRIMARY:
489
case REPLICATION_MODE_SECONDARY:
490
/* the chain is inspected under the graph read lock */
bdrv_graph_rdlock_main_loop();
491
active_disk = bs->file;
492
if (!active_disk || !active_disk->bs || !active_disk->bs->backing) {
493
error_setg(errp, "Active disk doesn't have backing file");
494
bdrv_graph_rdunlock_main_loop();
498
hidden_disk = active_disk->bs->backing;
499
if (!hidden_disk->bs || !hidden_disk->bs->backing) {
500
error_setg(errp, "Hidden disk doesn't have backing file");
501
bdrv_graph_rdunlock_main_loop();
505
secondary_disk = hidden_disk->bs->backing;
506
if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
507
error_setg(errp, "The secondary disk doesn't have block backend");
508
bdrv_graph_rdunlock_main_loop();
511
bdrv_graph_rdunlock_main_loop();
514
/* a negative length is an error; all three lengths must be equal */
active_length = bdrv_getlength(active_disk->bs);
515
hidden_length = bdrv_getlength(hidden_disk->bs);
516
disk_length = bdrv_getlength(secondary_disk->bs);
517
if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
518
active_length != hidden_length || hidden_length != disk_length) {
519
error_setg(errp, "Active disk, hidden disk, secondary disk's length"
520
" are not the same");
525
assert(active_disk->bs->drv && hidden_disk->bs->drv);
527
bdrv_graph_rdlock_main_loop();
528
/* secondary_do_checkpoint() empties both disks via bdrv_make_empty() */
if (!active_disk->bs->drv->bdrv_make_empty ||
529
!hidden_disk->bs->drv->bdrv_make_empty) {
531
"Active disk or hidden disk doesn't support make_empty");
532
bdrv_graph_rdunlock_main_loop();
535
bdrv_graph_rdunlock_main_loop();
538
reopen_backing_file(bs, true, &local_err);
540
error_propagate(errp, local_err);
546
/* take a reference before attaching the hidden disk as a child */
bdrv_ref(hidden_disk->bs);
547
s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
548
&child_of_bds, BDRV_CHILD_DATA,
551
error_propagate(errp, local_err);
552
bdrv_graph_wrunlock();
556
/* same pattern for the secondary disk */
bdrv_ref(secondary_disk->bs);
557
s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
558
"secondary disk", &child_of_bds,
559
BDRV_CHILD_DATA, &local_err);
561
error_propagate(errp, local_err);
562
bdrv_graph_wrunlock();
567
/* blocker placed on the top node below; freed in backup_job_cleanup() */
error_setg(&s->blocker,
568
"Block device is in use by internal backup job");
570
top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
571
/* the top node must exist, be a root node, and have bs below it */
if (!top_bs || !bdrv_is_root_node(top_bs) ||
572
!check_top_bs(top_bs, bs)) {
573
error_setg(errp, "No top_bs or it is invalid");
574
bdrv_graph_wrunlock();
575
/* roll back the writable reopen; errors are ignored */
reopen_backing_file(bs, false, NULL);
578
/* block every operation on the top node, then re-allow dataplane */
bdrv_op_block_all(top_bs, s->blocker);
579
bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
581
bdrv_graph_wrunlock();
583
/* internal job; backup_job_completed() is its completion callback */
s->backup_job = backup_job_create(
584
NULL, s->secondary_disk->bs, s->hidden_disk->bs,
585
0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
587
BLOCKDEV_ON_ERROR_REPORT,
588
BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
589
backup_job_completed, bs, NULL, &local_err);
591
error_propagate(errp, local_err);
592
backup_job_cleanup(bs);
595
job_start(&s->backup_job->job);
601
s->stage = BLOCK_REPLICATION_RUNNING;
603
if (s->mode == REPLICATION_MODE_SECONDARY) {
604
secondary_do_checkpoint(bs, errp);
610
/*
 * Checkpoint callback (installed as replication_ops.checkpoint).
 *
 * On a secondary-mode node it delegates to secondary_do_checkpoint(), which
 * reports errors through @errp.
 *
 * NOTE(review): the body of the DONE/FAILOVER stage check is not visible in
 * this excerpt.
 */
static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
612
BlockDriverState *bs = rs->opaque;
613
BDRVReplicationState *s = bs->opaque;
615
if (s->stage == BLOCK_REPLICATION_DONE ||
616
s->stage == BLOCK_REPLICATION_FAILOVER) {
625
if (s->mode == REPLICATION_MODE_SECONDARY) {
626
secondary_do_checkpoint(bs, errp);
630
/*
 * Error query callback (installed as replication_ops.get_error).
 *
 * Sets "Block replication is not running" when the stage is
 * BLOCK_REPLICATION_NONE, and can set "I/O error occurred".
 *
 * NOTE(review): the condition guarding the I/O error message is not visible
 * in this excerpt.
 */
static void replication_get_error(ReplicationState *rs, Error **errp)
632
BlockDriverState *bs = rs->opaque;
633
BDRVReplicationState *s = bs->opaque;
635
if (s->stage == BLOCK_REPLICATION_NONE) {
636
error_setg(errp, "Block replication is not running");
641
error_setg(errp, "I/O error occurred");
646
/*
 * Completion callback of the commit job; replication_stop() passes it to
 * commit_active_start() with the BlockDriverState as @opaque.
 *
 * One visible path sets the stage to BLOCK_REPLICATION_DONE and releases
 * the secondary and hidden disk children, clearing the pointers. Another
 * sets the stage to BLOCK_REPLICATION_FAILOVER_FAILED.
 *
 * NOTE(review): the condition selecting between the two paths is not
 * visible here; presumably it depends on @ret - confirm against the full
 * source. The acquisition matching bdrv_graph_wrunlock() is also not shown.
 */
static void replication_done(void *opaque, int ret)
648
BlockDriverState *bs = opaque;
649
BDRVReplicationState *s = bs->opaque;
652
s->stage = BLOCK_REPLICATION_DONE;
655
/* drop the children attached in replication_start() */
bdrv_unref_child(bs, s->secondary_disk);
656
s->secondary_disk = NULL;
657
bdrv_unref_child(bs, s->hidden_disk);
658
s->hidden_disk = NULL;
659
bdrv_graph_wrunlock();
663
s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
668
/*
 * Stop callback (installed as replication_ops.stop).
 *
 * Unless the stage is BLOCK_REPLICATION_RUNNING, "Block replication is not
 * running" is set.
 *
 * Primary mode: the stage becomes BLOCK_REPLICATION_DONE.
 *
 * Secondary mode: the backup job is cancelled synchronously. Two
 * continuations are visible: a final checkpoint followed by stage DONE, and
 * a failover path that sets stage FAILOVER and starts an internal commit
 * job from the node behind the "file" child to the secondary disk, with
 * replication_done() as its completion callback.
 *
 * NOTE(review): the use of @failover to choose between the two
 * continuations, and the body of the DONE/FAILOVER stage check, are not
 * visible in this excerpt.
 */
static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
670
BlockDriverState *bs = rs->opaque;
671
BDRVReplicationState *s = bs->opaque;
673
if (s->stage == BLOCK_REPLICATION_DONE ||
674
s->stage == BLOCK_REPLICATION_FAILOVER) {
683
if (s->stage != BLOCK_REPLICATION_RUNNING) {
684
error_setg(errp, "Block replication is not running");
689
case REPLICATION_MODE_PRIMARY:
690
s->stage = BLOCK_REPLICATION_DONE;
693
case REPLICATION_MODE_SECONDARY:
700
job_cancel_sync(&s->backup_job->job, true);
704
secondary_do_checkpoint(bs, errp);
705
s->stage = BLOCK_REPLICATION_DONE;
709
bdrv_graph_rdlock_main_loop();
710
s->stage = BLOCK_REPLICATION_FAILOVER;
711
/* replication_done() updates the stage once the commit job finishes */
s->commit_job = commit_active_start(
712
NULL, bs->file->bs, s->secondary_disk->bs,
713
JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
714
NULL, replication_done, bs, true, errp);
715
bdrv_graph_rdunlock_main_loop();
722
/*
 * String table referenced by bdrv_replication.strong_runtime_opts.
 * NOTE(review): its entries are not visible in this excerpt.
 */
static const char *const replication_strong_runtime_opts[] = {
729
/*
 * Block driver definition for the "replication" format. It wires the
 * open/close, child permission, length and sector-based read/write
 * callbacks defined in this file, and sizes the per-node state as
 * BDRVReplicationState.
 */
static BlockDriver bdrv_replication = {
730
.format_name = "replication",
731
.instance_size = sizeof(BDRVReplicationState),
733
.bdrv_open = replication_open,
734
.bdrv_close = replication_close,
735
.bdrv_child_perm = replication_child_perm,
737
.bdrv_co_getlength = replication_co_getlength,
738
.bdrv_co_readv = replication_co_readv,
739
.bdrv_co_writev = replication_co_writev,
743
.strong_runtime_opts = replication_strong_runtime_opts,
746
/* Module init hook: registers the replication block driver. */
static void bdrv_replication_init(void)
748
bdrv_register(&bdrv_replication);
751
/* Hook bdrv_replication_init() into the block-layer init sequence. */
block_init(bdrv_replication_init);