16
#include "qemu/osdep.h"
17
#include "qemu/cutils.h"
18
#include "qemu/module.h"
19
#include "qemu/option.h"
20
#include "qemu/memalign.h"
21
#include "block/block_int.h"
22
#include "block/coroutines.h"
23
#include "block/qdict.h"
24
#include "qapi/error.h"
25
#include "qapi/qapi-events-block.h"
26
#include "qapi/qmp/qdict.h"
27
#include "qapi/qmp/qerror.h"
28
#include "qapi/qmp/qlist.h"
29
#include "qapi/qmp/qstring.h"
30
#include "crypto/hash.h"
34
/* Size of the stack buffers that hold generated "children.<index>" names. */
#define INDEXSTR_LEN 32
36
/* Names of the runtime options accepted by the driver (see quorum_runtime_opts). */
#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
37
#define QUORUM_OPT_BLKVERIFY "blkverify"
38
#define QUORUM_OPT_REWRITE "rewrite-corrupted"
39
#define QUORUM_OPT_READ_PATTERN "read-pattern"
42
/* A value that can be voted on. */
typedef union QuorumVoteValue {
43
/* Hash bytes; quorum_sha256_compare() compares HASH_LENGTH bytes of this member. */
uint8_t h[HASH_LENGTH];
48
/* One vote, linked into the items list of the QuorumVoteVersion it was cast for. */
typedef struct QuorumVoteItem {
50
QLIST_ENTRY(QuorumVoteItem) next;
58
/* One distinct value seen during a vote, together with the votes it received. */
typedef struct QuorumVoteVersion {
59
QuorumVoteValue value; /* the value this version stands for */
62
QLIST_HEAD(, QuorumVoteItem) items; /* votes recorded for this value */
63
QLIST_ENTRY(QuorumVoteVersion) next; /* link in QuorumVotes.vote_list */
67
/* The set of versions collected for one vote. */
typedef struct QuorumVotes {
68
QLIST_HEAD(, QuorumVoteVersion) vote_list;
69
/* Returns true when a and b count as the same value. */
bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
73
/* Per-node driver state, stored in bs->opaque. */
typedef struct BDRVQuorumState {
76
/* Index used to build the "children.%u" name of the next child added at runtime. */
unsigned next_child_index;
90
/* Value of the "rewrite-corrupted" option. */
bool rewrite_corrupted;
94
/* Read strategy selected through the "read-pattern" option. */
QuorumReadPattern read_pattern;
97
typedef struct QuorumAIOCB QuorumAIOCB;
104
/* State of the sub-request sent to one child on behalf of a QuorumAIOCB. */
typedef struct QuorumChildRequest {
105
BlockDriverState *bs; /* node of the targeted child; filled in when the sub-request is issued */
118
BlockDriverState *bs;
128
QuorumChildRequest *qcrs;
143
typedef struct QuorumCo {
148
static void quorum_aio_finalize(QuorumAIOCB *acb)
154
/*
 * Vote comparator for hash values: returns true when the HASH_LENGTH bytes
 * of a->h and b->h are identical.
 */
static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
156
return !memcmp(a->h, b->h, HASH_LENGTH);
159
static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
164
/*
 * Allocate and initialise the QuorumAIOCB describing one request on bs.
 *
 * The request remembers the calling coroutine so that it can be re-entered
 * later, starts with an empty vote list that compares hashes, and gets one
 * zero-initialised QuorumChildRequest per child, each pointing back to the
 * new QuorumAIOCB.
 */
static QuorumAIOCB *coroutine_fn quorum_aio_get(BlockDriverState *bs,
166
uint64_t offset, uint64_t bytes,
169
BDRVQuorumState *s = bs->opaque;
170
QuorumAIOCB *acb = g_new(QuorumAIOCB, 1);
173
*acb = (QuorumAIOCB) {
174
.co = qemu_coroutine_self(),
180
.votes.compare = quorum_sha256_compare,
181
/*
 * NOTE(review): acb is a pointer, so "acb.votes" only compiles if the macro
 * ignores its argument - confirm against the QLIST_HEAD_INITIALIZER definition.
 */
.votes.vote_list = QLIST_HEAD_INITIALIZER(acb.votes.vote_list),
184
acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
185
for (i = 0; i < s->num_children; i++) {
186
acb->qcrs[i].buf = NULL;
187
acb->qcrs[i].ret = 0;
188
acb->qcrs[i].parent = acb;
194
/*
 * Send a "quorum report bad" event for the node called node_name.
 *
 * The byte range [offset, offset + bytes) is converted to sectors: the start
 * is rounded down and the end rounded up, so the reported sector count covers
 * the whole range. msg may carry strerror(-ret); the condition guarding that
 * assignment is not visible in this view, otherwise msg stays NULL.
 */
static void quorum_report_bad(QuorumOpType type, uint64_t offset,
195
uint64_t bytes, char *node_name, int ret)
197
const char *msg = NULL;
198
int64_t start_sector = offset / BDRV_SECTOR_SIZE;
199
int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
202
msg = strerror(-ret);
205
qapi_event_send_quorum_report_bad(type, msg, node_name, start_sector,
206
end_sector - start_sector);
209
/*
 * Send a "quorum failure" event for the request's range, identified by the
 * device or node name of acb->bs. The start sector is rounded down and the
 * end is computed with DIV_ROUND_UP; the event carries the start sector and
 * the sector count.
 */
static void GRAPH_RDLOCK quorum_report_failure(QuorumAIOCB *acb)
211
const char *reference = bdrv_get_device_or_node_name(acb->bs);
212
int64_t start_sector = acb->offset / BDRV_SECTOR_SIZE;
213
int64_t end_sector = DIV_ROUND_UP(acb->offset + acb->bytes,
216
qapi_event_send_quorum_failure(reference, start_sector,
217
end_sector - start_sector);
220
static int quorum_vote_error(QuorumAIOCB *acb);
222
/*
 * Check whether too few children completed successfully to reach the vote
 * threshold. When success_count is below the threshold, the request result
 * (vote_ret) is taken from quorum_vote_error() and a failure event is sent.
 * NOTE(review): the return statements are not visible in this view.
 */
static bool GRAPH_RDLOCK quorum_has_too_much_io_failed(QuorumAIOCB *acb)
224
BDRVQuorumState *s = acb->bs->opaque;
226
if (acb->success_count < s->threshold) {
227
acb->vote_ret = quorum_vote_error(acb);
228
quorum_report_failure(acb);
235
/*
 * Copy the data of source into dest, element by element.
 *
 * Both vectors must have the same number of elements, the same total size
 * and matching element lengths; this is asserted rather than handled.
 */
static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
238
assert(dest->niov == source->niov);
239
assert(dest->size == source->size);
240
for (i = 0; i < source->niov; i++) {
241
assert(dest->iov[i].iov_len == source->iov[i].iov_len);
242
memcpy(dest->iov[i].iov_base,
243
source->iov[i].iov_base,
244
source->iov[i].iov_len);
248
/*
 * Report a child sub-request as bad. The operation type comes from the
 * parent's is_read flag, the byte range from the parent request, and the node
 * name from the child node the sub-request was sent to.
 */
static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret)
250
QuorumAIOCB *acb = sacb->parent;
251
QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
252
quorum_report_bad(type, acb->offset, acb->bytes, sacb->bs->node_name, ret);
255
/*
 * Walk the vote list of the request and send a read-type bad report (with
 * ret 0) for the child behind each item of the reported versions.
 * NOTE(review): the body of the compare branch is not visible in this view;
 * presumably versions equal to value are skipped - confirm.
 */
static void quorum_report_bad_versions(BDRVQuorumState *s,
257
QuorumVoteValue *value)
259
QuorumVoteVersion *version;
260
QuorumVoteItem *item;
262
QLIST_FOREACH(version, &acb->votes.vote_list, next) {
263
if (acb->votes.compare(&version->value, value)) {
266
QLIST_FOREACH(item, &version->items, next) {
267
quorum_report_bad(QUORUM_OP_TYPE_READ, acb->offset, acb->bytes,
268
s->children[item->index]->bs->node_name, 0);
277
/*
 * Coroutine entry point that writes the request buffer (acb->qiov) to the
 * child selected by co->idx.
 *
 * opaque points to a QuorumCo. Once the last outstanding rewrite has finished
 * (rewrite_count drops to 0), the request coroutine is entered again unless
 * it is already active.
 */
static void coroutine_fn GRAPH_RDLOCK quorum_rewrite_entry(void *opaque)
279
QuorumCo *co = opaque;
280
QuorumAIOCB *acb = co->acb;
281
BDRVQuorumState *s = acb->bs->opaque;
287
/* BDRV_REQ_WRITE_UNCHANGED is cleared from the request flags for this write. */
bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes,
288
acb->qiov, acb->flags & ~BDRV_REQ_WRITE_UNCHANGED);
291
acb->rewrite_count--;
292
if (!acb->rewrite_count) {
293
qemu_coroutine_enter_if_inactive(acb->co);
297
/*
 * Start rewrite coroutines for the vote items of the request.
 *
 * The vote list is walked twice. The first walk precedes the assignment of
 * acb->rewrite_count (presumably it counts the items to rewrite - the
 * counting code is not visible here). The second walk creates and enters a
 * quorum_rewrite_entry coroutine.
 * NOTE(review): the bodies of the compare branches and the return value are
 * not visible in this view.
 */
static bool coroutine_fn GRAPH_RDLOCK
298
quorum_rewrite_bad_versions(QuorumAIOCB *acb, QuorumVoteValue *value)
300
QuorumVoteVersion *version;
301
QuorumVoteItem *item;
307
QLIST_FOREACH(version, &acb->votes.vote_list, next) {
308
if (acb->votes.compare(&version->value, value)) {
311
QLIST_FOREACH(item, &version->items, next) {
317
acb->rewrite_count = count;
320
QLIST_FOREACH(version, &acb->votes.vote_list, next) {
321
if (acb->votes.compare(&version->value, value)) {
324
QLIST_FOREACH(item, &version->items, next) {
331
co = qemu_coroutine_create(quorum_rewrite_entry, &data);
332
qemu_coroutine_enter(co);
340
/*
 * Record one vote for value in votes.
 *
 * The list is searched for a version equal to value using votes->compare.
 * A new version (vote_count 0, remembering index and a copy of value) is
 * inserted at the head of the list - presumably only when no match was
 * found; the guarding condition is not visible here. The version's
 * vote_count is then incremented and a new item is added to its items list.
 */
static void quorum_count_vote(QuorumVotes *votes,
341
QuorumVoteValue *value,
344
QuorumVoteVersion *v = NULL, *version = NULL;
345
QuorumVoteItem *item;
348
QLIST_FOREACH(v, &votes->vote_list, next) {
349
if (votes->compare(&v->value, value)) {
357
version = g_new0(QuorumVoteVersion, 1);
358
QLIST_INIT(&version->items);
359
memcpy(&version->value, value, sizeof(version->value));
360
version->index = index;
361
version->vote_count = 0;
362
QLIST_INSERT_HEAD(&votes->vote_list, version, next);
365
version->vote_count++;
367
item = g_new0(QuorumVoteItem, 1);
369
QLIST_INSERT_HEAD(&version->items, item, next);
372
/*
 * Empty a vote list: every version is unlinked from the list and every item
 * is unlinked from its version. The _SAFE iterators are used because elements
 * are removed while iterating.
 * NOTE(review): the calls releasing the memory are not visible in this view.
 */
static void quorum_free_vote_list(QuorumVotes *votes)
374
QuorumVoteVersion *version, *next_version;
375
QuorumVoteItem *item, *next_item;
377
QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
378
QLIST_REMOVE(version, next);
379
QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
380
QLIST_REMOVE(item, next);
387
/*
 * Hash the buffer of child request i (acb->qcrs[i].qiov) with SHA-256.
 *
 * data and len describe hash->h, presumably the destination handed to
 * qcrypto_hash_bytesv() - its remaining arguments and the return value
 * handling are not visible in this view.
 */
static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
389
QEMUIOVector *qiov = &acb->qcrs[i].qiov;
390
size_t len = sizeof(hash->h);
391
uint8_t *data = hash->h;
396
if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256,
397
qiov->iov, qiov->niov,
406
/*
 * Scan all versions and track the largest vote_count seen so far. Because
 * the comparison is strict, a later candidate with an equal count does not
 * replace the current maximum.
 * NOTE(review): the winner assignment and the return are not visible here.
 */
static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
409
QuorumVoteVersion *candidate, *winner = NULL;
411
QLIST_FOREACH(candidate, &votes->vote_list, next) {
412
if (candidate->vote_count > max) {
413
max = candidate->vote_count;
427
/*
 * Compare two I/O vectors element by element with memcmp(). The vectors must
 * have the same number of elements and matching element lengths (asserted).
 * NOTE(review): how the memcmp() result maps to the return value is not
 * visible in this view.
 */
static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
432
assert(a->niov == b->niov);
433
for (i = 0; i < a->niov; i++) {
434
assert(a->iov[i].iov_len == b->iov[i].iov_len);
435
result = memcmp(a->iov[i].iov_base,
446
/*
 * Compare the buffers a and b read for the request acb.
 *
 * In blkverify mode qemu_iovec_compare() provides the offset of the first
 * difference, which is used to print a mismatch message on stderr (the
 * condition guarding the message is not visible here). Otherwise the result
 * of quorum_iovec_compare() is returned.
 */
static bool quorum_compare(QuorumAIOCB *acb, QEMUIOVector *a, QEMUIOVector *b)
448
BDRVQuorumState *s = acb->bs->opaque;
452
if (s->is_blkverify) {
453
offset = qemu_iovec_compare(a, b);
455
fprintf(stderr, "quorum: offset=%" PRIu64 " bytes=%" PRIu64
456
" contents mismatch at offset %" PRIu64 "\n",
457
acb->offset, acb->bytes, acb->offset + offset);
463
return quorum_iovec_compare(a, b);
467
/*
 * Vote on the return codes of the child requests.
 *
 * Return codes are counted in a temporary vote list whose values are compared
 * with quorum_64bits_compare; the winning value becomes the result and the
 * temporary list is released afterwards.
 * NOTE(review): the conditions selecting which return codes are counted, and
 * the final return, are not visible in this view.
 */
static int quorum_vote_error(QuorumAIOCB *acb)
469
BDRVQuorumState *s = acb->bs->opaque;
470
QuorumVoteVersion *winner = NULL;
471
QuorumVotes error_votes;
472
QuorumVoteValue result_value;
476
QLIST_INIT(&error_votes.vote_list);
477
error_votes.compare = quorum_64bits_compare;
479
for (i = 0; i < s->num_children; i++) {
480
ret = acb->qcrs[i].ret;
483
result_value.l = ret;
484
quorum_count_vote(&error_votes, &result_value, i);
489
winner = quorum_get_vote_winner(&error_votes);
490
ret = winner->value.l;
493
quorum_free_vote_list(&error_votes);
498
/*
 * Decide the result of a read once the child requests have completed.
 *
 * Visible steps: check that enough children succeeded; compare the buffer of
 * the first successful child with those of the later successful children;
 * otherwise hash every successful child's buffer, count the hashes as votes
 * and pick the winner. A winner below the threshold yields -EIO and a failure
 * event; otherwise the winner's data is copied to the caller's buffer, the
 * differing versions are reported and, if enabled, rewritten.
 * NOTE(review): several branches and early exits are not visible in this
 * view, so the exact control flow between these steps should be confirmed.
 */
static void coroutine_fn GRAPH_RDLOCK quorum_vote(QuorumAIOCB *acb)
502
QuorumVoteValue hash;
503
BDRVQuorumState *s = acb->bs->opaque;
504
QuorumVoteVersion *winner;
506
if (quorum_has_too_much_io_failed(acb)) {
511
/* Look for the first child whose request succeeded (ret == 0). */
for (i = 0; i < s->num_children; i++) {
512
if (!acb->qcrs[i].ret) {
517
assert(i < s->num_children);
522
/* Compare that child's buffer with the remaining children. */
for (j = i + 1; j < s->num_children; j++) {
523
if (acb->qcrs[j].ret) {
526
quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov);
534
quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov);
539
/* Hash-based vote over the children. */
for (i = 0; i < s->num_children; i++) {
540
if (acb->qcrs[i].ret) {
543
ret = quorum_compute_hash(acb, i, &hash);
549
quorum_count_vote(&acb->votes, &hash, i);
553
winner = quorum_get_vote_winner(&acb->votes);
556
if (winner->vote_count < s->threshold) {
557
quorum_report_failure(acb);
558
acb->vote_ret = -EIO;
563
/* Hand the winning version's data to the caller. */
quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov);
566
quorum_report_bad_versions(s, acb, &winner->value);
569
if (s->rewrite_corrupted) {
570
quorum_rewrite_bad_versions(acb, &winner->value);
575
quorum_free_vote_list(&acb->votes);
582
/*
 * Coroutine entry point reading the request range from one child into that
 * child's private buffer (acb->qcrs[i].qiov).
 *
 * opaque points to a QuorumCo. A return value of 0 counts as a success;
 * quorum_report_bad_acb() is called with the child's return value (presumably
 * in the failure branch - the else is not visible here). When acb->count
 * equals the number of children, the request coroutine is entered again
 * unless it is already active.
 */
static void coroutine_fn GRAPH_RDLOCK read_quorum_children_entry(void *opaque)
584
QuorumCo *co = opaque;
585
QuorumAIOCB *acb = co->acb;
586
BDRVQuorumState *s = acb->bs->opaque;
588
QuorumChildRequest *sacb = &acb->qcrs[i];
590
sacb->bs = s->children[i]->bs;
591
sacb->ret = bdrv_co_preadv(s->children[i], acb->offset, acb->bytes,
592
&acb->qcrs[i].qiov, 0);
594
if (sacb->ret == 0) {
595
acb->success_count++;
597
quorum_report_bad_acb(sacb, sacb->ret);
601
assert(acb->count <= s->num_children);
602
assert(acb->success_count <= s->num_children);
605
if (acb->count == s->num_children) {
606
qemu_coroutine_enter_if_inactive(acb->co);
610
/*
 * Read the request range from all children and return the vote result
 * (acb->vote_ret).
 */
static int coroutine_fn GRAPH_RDLOCK read_quorum_children(QuorumAIOCB *acb)
612
BDRVQuorumState *s = acb->bs->opaque;
615
acb->children_read = s->num_children;
616
/* Give each child an aligned private buffer laid out like the caller's qiov. */
for (i = 0; i < s->num_children; i++) {
617
acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size);
618
qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
619
qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
622
/* Start one read coroutine per child. */
for (i = 0; i < s->num_children; i++) {
629
co = qemu_coroutine_create(read_quorum_children_entry, &data);
630
qemu_coroutine_enter(co);
633
/* Yield until acb->count has reached the number of children. */
while (acb->count < s->num_children) {
634
qemu_coroutine_yield();
639
/* Release the per-child buffers. */
for (i = 0; i < s->num_children; i++) {
640
qemu_vfree(acb->qcrs[i].buf);
641
qemu_iovec_destroy(&acb->qcrs[i].qiov);
644
/* Yield until no rewrite is outstanding any more. */
while (acb->rewrite_count) {
645
qemu_coroutine_yield();
648
return acb->vote_ret;
651
/*
 * Read from the children one at a time, in index order starting at
 * acb->children_read. The next child is tried only while the previous read
 * failed (ret < 0) and children remain.
 * NOTE(review): the guard around quorum_report_bad_acb() and the return are
 * not visible in this view.
 */
static int coroutine_fn GRAPH_RDLOCK read_fifo_child(QuorumAIOCB *acb)
653
BDRVQuorumState *s = acb->bs->opaque;
658
n = acb->children_read++;
659
acb->qcrs[n].bs = s->children[n]->bs;
660
ret = bdrv_co_preadv(s->children[n], acb->offset, acb->bytes,
663
quorum_report_bad_acb(&acb->qcrs[n], ret);
665
} while (ret < 0 && acb->children_read < s->num_children);
672
/*
 * Read handler of the driver. A QuorumAIOCB is set up for the request; with
 * read pattern QUORUM_READ_PATTERN_QUORUM the read goes through
 * read_quorum_children(), and read_fifo_child() is the other visible path
 * (presumably the else branch). The request is finalised afterwards.
 */
static int coroutine_fn GRAPH_RDLOCK
673
quorum_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
674
QEMUIOVector *qiov, BdrvRequestFlags flags)
676
BDRVQuorumState *s = bs->opaque;
677
QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
681
acb->children_read = 0;
683
if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
684
ret = read_quorum_children(acb);
686
ret = read_fifo_child(acb);
688
quorum_aio_finalize(acb);
697
/*
 * Coroutine entry point performing the write on one child.
 *
 * opaque points to a QuorumCo. Requests flagged BDRV_REQ_ZERO_WRITE use
 * bdrv_co_pwrite_zeroes(); bdrv_co_pwritev() with the caller's qiov is the
 * other visible path. A return value of 0 counts as a success;
 * quorum_report_bad_acb() is called with the child's return value (presumably
 * in the failure branch - the else is not visible here). When acb->count
 * equals the number of children, the request coroutine is entered again
 * unless it is already active.
 */
static void coroutine_fn GRAPH_RDLOCK write_quorum_entry(void *opaque)
699
QuorumCo *co = opaque;
700
QuorumAIOCB *acb = co->acb;
701
BDRVQuorumState *s = acb->bs->opaque;
703
QuorumChildRequest *sacb = &acb->qcrs[i];
705
sacb->bs = s->children[i]->bs;
706
if (acb->flags & BDRV_REQ_ZERO_WRITE) {
707
sacb->ret = bdrv_co_pwrite_zeroes(s->children[i], acb->offset,
708
acb->bytes, acb->flags);
710
sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes,
711
acb->qiov, acb->flags);
713
if (sacb->ret == 0) {
714
acb->success_count++;
716
quorum_report_bad_acb(sacb, sacb->ret);
719
assert(acb->count <= s->num_children);
720
assert(acb->success_count <= s->num_children);
723
if (acb->count == s->num_children) {
724
qemu_coroutine_enter_if_inactive(acb->co);
728
/*
 * Write handler of the driver: starts one write_quorum_entry coroutine per
 * child, yields until acb->count has reached the number of children, then
 * checks whether enough writes succeeded and finalises the request.
 * NOTE(review): the value returned to the caller is not visible in this view.
 */
static int coroutine_fn GRAPH_RDLOCK
729
quorum_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
730
QEMUIOVector *qiov, BdrvRequestFlags flags)
732
BDRVQuorumState *s = bs->opaque;
733
QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags);
736
for (i = 0; i < s->num_children; i++) {
743
co = qemu_coroutine_create(write_quorum_entry, &data);
744
qemu_coroutine_enter(co);
747
while (acb->count < s->num_children) {
748
qemu_coroutine_yield();
751
/* Return value unused here; the call sets acb->vote_ret when the threshold is missed. */
quorum_has_too_much_io_failed(acb);
754
quorum_aio_finalize(acb);
759
/*
 * Write-zeroes handler: forwards to quorum_co_pwritev() without a data
 * buffer and with BDRV_REQ_ZERO_WRITE added to the flags.
 */
static int coroutine_fn GRAPH_RDLOCK
760
quorum_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
761
BdrvRequestFlags flags)
763
return quorum_co_pwritev(bs, offset, bytes, NULL,
764
flags | BDRV_REQ_ZERO_WRITE);
767
/*
 * Length handler: takes the length of the first child as the result and
 * compares the length of every other child against it.
 * NOTE(review): the error handling and the outcome of a length mismatch are
 * not visible in this view.
 */
static int64_t coroutine_fn GRAPH_RDLOCK
768
quorum_co_getlength(BlockDriverState *bs)
770
BDRVQuorumState *s = bs->opaque;
775
result = bdrv_co_getlength(s->children[0]->bs);
779
for (i = 1; i < s->num_children; i++) {
780
int64_t value = bdrv_co_getlength(s->children[i]->bs);
784
if (value != result) {
792
/*
 * Flush handler: flushes every child in turn. Flush results are reported
 * with a flush-type bad report and counted as votes in a temporary list
 * compared with quorum_64bits_compare. The visible code compares
 * success_count with the threshold and also derives a result from the
 * winning vote.
 * NOTE(review): the conditions guarding the report/vote, the success
 * counting and the final return are not visible in this view.
 */
static coroutine_fn GRAPH_RDLOCK int quorum_co_flush(BlockDriverState *bs)
794
BDRVQuorumState *s = bs->opaque;
795
QuorumVoteVersion *winner = NULL;
796
QuorumVotes error_votes;
797
QuorumVoteValue result_value;
800
int success_count = 0;
802
QLIST_INIT(&error_votes.vote_list);
803
error_votes.compare = quorum_64bits_compare;
805
for (i = 0; i < s->num_children; i++) {
806
result = bdrv_co_flush(s->children[i]->bs);
808
quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, 0,
809
s->children[i]->bs->node_name, result);
810
result_value.l = result;
811
quorum_count_vote(&error_votes, &result_value, i);
817
if (success_count >= s->threshold) {
820
winner = quorum_get_vote_winner(&error_votes);
821
result = winner->value.l;
823
quorum_free_vote_list(&error_votes);
828
/*
 * Decide whether to_replace may be replaced below this quorum node.
 *
 * When to_replace is directly one of our children, the visible answer is
 * true only if our child link is the sole entry of to_replace's parents list
 * (it is the first entry and has no successor).
 * NOTE(review): the handling of children that are not to_replace is not
 * visible in this view.
 */
static bool GRAPH_RDLOCK
829
quorum_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace)
831
BDRVQuorumState *s = bs->opaque;
834
for (i = 0; i < s->num_children; i++) {
862
if (s->children[i]->bs == to_replace) {
873
return QLIST_FIRST(&to_replace->parents) == s->children[i] &&
874
QLIST_NEXT(s->children[i], next_parent) == NULL;
881
/*
 * Validate the vote threshold against the number of children. An error is
 * set for a threshold that is not "a value >= 1" (the condition itself is
 * not visible here) and for a threshold larger than num_children.
 * NOTE(review): the return values are not visible in this view.
 */
static int quorum_valid_threshold(int threshold, int num_children, Error **errp)
885
error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
886
"vote-threshold", "a value >= 1");
890
if (threshold > num_children) {
891
error_setg(errp, "threshold may not exceed children count");
898
/*
 * Runtime options understood by the driver: a numeric vote threshold, two
 * boolean switches (blkverify, rewrite-corrupted) and the read pattern given
 * as a string.
 */
static QemuOptsList quorum_runtime_opts = {
900
.head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head),
903
.name = QUORUM_OPT_VOTE_THRESHOLD,
904
.type = QEMU_OPT_NUMBER,
905
.help = "The number of vote needed for reaching quorum",
908
.name = QUORUM_OPT_BLKVERIFY,
909
.type = QEMU_OPT_BOOL,
910
.help = "Trigger block verify mode if set",
913
.name = QUORUM_OPT_REWRITE,
914
.type = QEMU_OPT_BOOL,
915
.help = "Rewrite corrupted block on read quorum",
918
.name = QUORUM_OPT_READ_PATTERN,
919
.type = QEMU_OPT_STRING,
920
.help = "Allowed pattern: quorum, fifo. Quorum is default",
926
/*
 * Recompute bs->supported_zero_flags from the current set of children:
 * start from FUA | MAY_UNMAP | NO_FALLBACK, keep only the flags that every
 * child supports, then add BDRV_REQ_WRITE_UNCHANGED unconditionally.
 */
static void quorum_refresh_flags(BlockDriverState *bs)
928
BDRVQuorumState *s = bs->opaque;
931
bs->supported_zero_flags =
932
BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
934
for (i = 0; i < s->num_children; i++) {
935
bs->supported_zero_flags &= s->children[i]->bs->supported_zero_flags;
938
bs->supported_zero_flags |= BDRV_REQ_WRITE_UNCHANGED;
941
/*
 * Open handler of the driver.
 *
 * Visible steps: count the "children." entries of the flattened options,
 * parse and validate the runtime options (vote threshold, read pattern,
 * blkverify, rewrite-corrupted), open every child as "children.<i>", and
 * initialise the supported request flags. The trailing part releases the
 * children and the children array under the graph write lock and deletes
 * the parsed options.
 * NOTE(review): gotos, labels and return statements are not visible in this
 * view, so which of the trailing cleanup steps run on success versus failure
 * must be confirmed against the full function.
 */
static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
944
BDRVQuorumState *s = bs->opaque;
945
QemuOpts *opts = NULL;
946
const char *pattern_str;
951
qdict_flatten(options);
954
/* Number of "children.<n>" entries found in the options. */
s->num_children = qdict_array_entries(options, "children.");
955
if (s->num_children < 0) {
956
error_setg(errp, "Option children is not a valid array");
960
if (s->num_children < 1) {
961
error_setg(errp, "Number of provided children must be 1 or more");
966
opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort);
967
if (!qemu_opts_absorb_qdict(opts, options, errp)) {
972
/* 0 is the default when the option is absent. */
s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
974
ret = quorum_valid_threshold(s->threshold, s->num_children, errp);
979
pattern_str = qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN);
981
ret = QUORUM_READ_PATTERN_QUORUM;
983
ret = qapi_enum_parse(&QuorumReadPattern_lookup, pattern_str,
987
error_setg(errp, "Please set read-pattern as fifo or quorum");
990
s->read_pattern = ret;
992
/* blkverify and rewrite-corrupted are read when the quorum read pattern is selected. */
if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
993
s->is_blkverify = qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false);
994
if (s->is_blkverify && (s->num_children != 2 || s->threshold != 2)) {
995
error_setg(errp, "blkverify=on can only be set if there are "
996
"exactly two files and vote-threshold is 2");
1001
s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE,
1003
if (s->rewrite_corrupted && s->is_blkverify) {
1005
"rewrite-corrupted=on cannot be used with blkverify=on");
1012
s->children = g_new0(BdrvChild *, s->num_children);
1013
opened = g_new0(bool, s->num_children);
1015
for (i = 0; i < s->num_children; i++) {
1016
char indexstr[INDEXSTR_LEN];
1017
ret = snprintf(indexstr, INDEXSTR_LEN, "children.%d", i);
1018
/* The generated name must fit the buffer. */
assert(ret < INDEXSTR_LEN);
1020
s->children[i] = bdrv_open_child(NULL, options, indexstr, bs,
1021
&child_of_bds, BDRV_CHILD_DATA, false,
1023
if (!s->children[i]) {
1030
/* Children added later continue numbering after the initial ones. */
s->next_child_index = s->num_children;
1032
bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED;
1033
quorum_refresh_flags(bs);
1040
bdrv_graph_wrlock();
1041
for (i = 0; i < s->num_children; i++) {
1045
bdrv_unref_child(bs, s->children[i]);
1047
bdrv_graph_wrunlock();
1048
g_free(s->children);
1051
qemu_opts_del(opts);
1055
/*
 * Close handler: drops the reference to every child while holding the graph
 * write lock, then frees the children array.
 */
static void quorum_close(BlockDriverState *bs)
1057
BDRVQuorumState *s = bs->opaque;
1060
bdrv_graph_wrlock();
1061
for (i = 0; i < s->num_children; i++) {
1062
bdrv_unref_child(bs, s->children[i]);
1064
bdrv_graph_wrunlock();
1066
g_free(s->children);
1069
/*
 * Attach child_bs as an additional child of the quorum node bs.
 *
 * Errors are reported through errp: blkverify mode does not allow adding
 * children, the child count and the child index have upper limits, and the
 * child name must fit into INDEXSTR_LEN. On success the children array
 * grows by one entry and the supported flags are recomputed.
 */
static void GRAPH_WRLOCK
1070
quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs, Error **errp)
1072
BDRVQuorumState *s = bs->opaque;
1074
char indexstr[INDEXSTR_LEN];
1077
if (s->is_blkverify) {
1078
error_setg(errp, "Cannot add a child to a quorum in blkverify mode");
1082
assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
1083
if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
1084
s->next_child_index == UINT_MAX) {
1085
error_setg(errp, "Too many children");
1089
ret = snprintf(indexstr, INDEXSTR_LEN, "children.%u", s->next_child_index);
1090
if (ret < 0 || ret >= INDEXSTR_LEN) {
1091
error_setg(errp, "cannot generate child name");
1094
/* Reserve the index; it is given back below if attaching fails. */
s->next_child_index++;
1099
child = bdrv_attach_child(bs, child_bs, indexstr, &child_of_bds,
1100
BDRV_CHILD_DATA, errp);
1101
if (child == NULL) {
1102
s->next_child_index--;
1105
s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
1106
s->children[s->num_children++] = child;
1107
quorum_refresh_flags(bs);
1110
/*
 * Detach child from the quorum node bs.
 *
 * child must be one of the node's children (asserted). An error message is
 * prepared when the child count is not above the vote threshold. Otherwise
 * the entry is removed from the children array, the array is shrunk, the
 * child reference is dropped and the supported flags are recomputed.
 */
static void GRAPH_WRLOCK
1111
quorum_del_child(BlockDriverState *bs, BdrvChild *child, Error **errp)
1113
BDRVQuorumState *s = bs->opaque;
1114
char indexstr[INDEXSTR_LEN];
1117
for (i = 0; i < s->num_children; i++) {
1118
if (s->children[i] == child) {
1124
assert(i < s->num_children);
1126
if (s->num_children <= s->threshold) {
1128
"The number of children cannot be lower than the vote threshold %d",
1134
assert(!s->is_blkverify);
1136
/* If the removed child carries the most recently generated name, reuse its index. */
snprintf(indexstr, INDEXSTR_LEN, "children.%u", s->next_child_index - 1);
1137
if (!strncmp(child->name, indexstr, INDEXSTR_LEN)) {
1138
s->next_child_index--;
1142
/* Close the gap left by entry i. */
memmove(&s->children[i], &s->children[i + 1],
1143
(s->num_children - i - 1) * sizeof(BdrvChild *));
1144
s->children = g_renew(BdrvChild *, s->children, --s->num_children);
1146
bdrv_unref_child(bs, child);
1148
quorum_refresh_flags(bs);
1151
/*
 * Store a "children" list in target and append a new reference to the
 * full_open_options of every child, in child order.
 */
static void quorum_gather_child_options(BlockDriverState *bs, QDict *target,
1152
bool backing_overridden)
1154
BDRVQuorumState *s = bs->opaque;
1155
QList *children_list;
1181
children_list = qlist_new();
1182
qdict_put(target, "children", children_list);
1184
for (i = 0; i < s->num_children; i++) {
1185
qlist_append(children_list,
1186
qobject_ref(s->children[i]->bs->full_open_options));
1190
/*
 * Directory-name handler: sets an error stating that no base directory can
 * be generated for quorum nodes.
 * NOTE(review): the return value is not visible in this view.
 */
static char *quorum_dirname(BlockDriverState *bs, Error **errp)
1196
error_setg(errp, "Cannot generate a base directory for quorum nodes");
1200
/*
 * Compute the permissions requested on (*nperm) and shared for (*nshared)
 * a child.
 *
 * The parent's permissions are passed through as far as
 * DEFAULT_PERM_PASSTHROUGH allows. Write permission is added when
 * rewrite-corrupted is enabled, since that mode writes to the children
 * during reads. Of the parent's shared permissions only consistent-read and
 * write-unchanged are kept, and DEFAULT_PERM_UNCHANGED is always shared.
 */
static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c,
1202
BlockReopenQueue *reopen_queue,
1203
uint64_t perm, uint64_t shared,
1204
uint64_t *nperm, uint64_t *nshared)
1206
BDRVQuorumState *s = bs->opaque;
1208
*nperm = perm & DEFAULT_PERM_PASSTHROUGH;
1209
if (s->rewrite_corrupted) {
1210
*nperm |= BLK_PERM_WRITE;
1217
*nshared = (shared & (BLK_PERM_CONSISTENT_READ |
1218
BLK_PERM_WRITE_UNCHANGED))
1219
| DEFAULT_PERM_UNCHANGED;
1228
/*
 * Block-status handler: queries the status of [offset, offset + count) on
 * every child.
 *
 * For children reporting BDRV_BLOCK_ZERO the smallest byte count is kept in
 * pnum_zero; pnum_data keeps the largest byte count (presumably for the
 * remaining children - the else branch is not visible here). The function
 * returns BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO.
 * NOTE(review): the condition guarding the bad report, the assignments to
 * *pnum and the condition choosing between the two return values are not
 * visible in this view.
 */
static int coroutine_fn GRAPH_RDLOCK
1229
quorum_co_block_status(BlockDriverState *bs, bool want_zero,
1230
int64_t offset, int64_t count,
1231
int64_t *pnum, int64_t *map, BlockDriverState **file)
1233
BDRVQuorumState *s = bs->opaque;
1235
int64_t pnum_zero = count;
1236
int64_t pnum_data = 0;
1238
for (i = 0; i < s->num_children; i++) {
1240
ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false,
1241
want_zero, offset, count,
1242
&bytes, NULL, NULL, NULL);
1244
quorum_report_bad(QUORUM_OP_TYPE_READ, offset, count,
1245
s->children[i]->bs->node_name, ret);
1255
if (ret & BDRV_BLOCK_ZERO) {
1256
pnum_zero = MIN(pnum_zero, bytes);
1258
pnum_data = MAX(pnum_data, bytes);
1264
return BDRV_BLOCK_DATA;
1267
return BDRV_BLOCK_ZERO;
1271
/* Option names installed as the driver's strong_runtime_opts table. */
static const char *const quorum_strong_runtime_opts[] = {
1272
QUORUM_OPT_VOTE_THRESHOLD,
1273
QUORUM_OPT_BLKVERIFY,
1275
QUORUM_OPT_READ_PATTERN,
1280
/*
 * Driver description for the "quorum" format: wires the callbacks defined in
 * this file into the block layer and sizes the per-node state.
 */
static BlockDriver bdrv_quorum = {
1281
.format_name = "quorum",
1283
.instance_size = sizeof(BDRVQuorumState),
1285
.bdrv_open = quorum_open,
1286
.bdrv_close = quorum_close,
1287
.bdrv_gather_child_options = quorum_gather_child_options,
1288
.bdrv_dirname = quorum_dirname,
1289
.bdrv_co_block_status = quorum_co_block_status,
1291
.bdrv_co_flush = quorum_co_flush,
1293
.bdrv_co_getlength = quorum_co_getlength,
1295
.bdrv_co_preadv = quorum_co_preadv,
1296
.bdrv_co_pwritev = quorum_co_pwritev,
1297
.bdrv_co_pwrite_zeroes = quorum_co_pwrite_zeroes,
1299
.bdrv_add_child = quorum_add_child,
1300
.bdrv_del_child = quorum_del_child,
1302
.bdrv_child_perm = quorum_child_perm,
1304
.bdrv_recurse_can_replace = quorum_recurse_can_replace,
1306
.strong_runtime_opts = quorum_strong_runtime_opts,
1309
/*
 * Module init function: registers the quorum driver with the block layer.
 * SHA-256 support is checked first; the body of that check is not visible in
 * this view (presumably registration is skipped when SHA-256 is unavailable,
 * since votes are based on SHA-256 hashes - confirm).
 */
static void bdrv_quorum_init(void)
1311
if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) {
1315
bdrv_register(&bdrv_quorum);
1318
/* Hook the init function into block-module initialisation. */
block_init(bdrv_quorum_init);