qemu

Форк
0
/
preallocate.c 
627 строк · 18.4 Кб
/*
 * preallocate filter driver
 *
 * The driver performs preallocate operation: it is injected above
 * some node, and before each write over EOF it does additional preallocating
 * write-zeroes request.
 *
 * Copyright (c) 2020 Virtuozzo International GmbH.
 *
 * Author:
 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
26

27
#include "qemu/osdep.h"
28

29
#include "qapi/error.h"
30
#include "qemu/module.h"
31
#include "qemu/option.h"
32
#include "qemu/units.h"
33
#include "block/block-io.h"
34
#include "block/block_int.h"
35

36

37
/*
 * User-tunable parameters of the preallocate filter, as parsed by
 * preallocate_absorb_opts().
 */
typedef struct PreallocateOpts {
    /* "prealloc-size": extra bytes to preallocate beyond the write end. */
    int64_t prealloc_size;
    /* "prealloc-align": alignment applied to the end of the preallocation. */
    int64_t prealloc_align;
} PreallocateOpts;
41

42
typedef struct BDRVPreallocateState {
43
    PreallocateOpts opts;
44

45
    /*
46
     * Track real data end, to crop preallocation on close. If < 0 the status is
47
     * unknown.
48
     *
49
     * @data_end is a maximum of file size on open (or when we get write/resize
50
     * permissions) and all write request ends after it. So it's safe to
51
     * truncate to data_end if it is valid.
52
     */
53
    int64_t data_end;
54

55
    /*
56
     * Start of trailing preallocated area which reads as zero. May be smaller
57
     * than data_end, if user does over-EOF write zero operation. If < 0 the
58
     * status is unknown.
59
     *
60
     * If both @zero_start and @file_end are valid, the region
61
     * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
62
     * is not valid, @zero_start doesn't make much sense.
63
     */
64
    int64_t zero_start;
65

66
    /*
67
     * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
68
     * to avoid extra lseek() calls on each write operation. If < 0 the status
69
     * is unknown.
70
     */
71
    int64_t file_end;
72

73
    /*
74
     * All three states @data_end, @zero_start and @file_end are guaranteed to
75
     * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
76
     * BLK_PERM_WRITE permissions on file child.
77
     */
78

79
    /* Gives up the resize permission on children when parents don't need it */
80
    QEMUBH *drop_resize_bh;
81
} BDRVPreallocateState;
82

83
/* Forward declarations: both are used before their definitions below. */
static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
static void preallocate_drop_resize_bh(void *opaque);
85

86
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
87
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
88
static QemuOptsList runtime_opts = {
89
    .name = "preallocate",
90
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
91
    .desc = {
92
        {
93
            .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
94
            .type = QEMU_OPT_SIZE,
95
            .help = "on preallocation, align file length to this number, "
96
                "default 1M",
97
        },
98
        {
99
            .name = PREALLOCATE_OPT_PREALLOC_SIZE,
100
            .type = QEMU_OPT_SIZE,
101
            .help = "how much to preallocate, default 128M",
102
        },
103
        { /* end of list */ }
104
    },
105
};
106

107
static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
108
                                    BlockDriverState *child_bs, Error **errp)
109
{
110
    QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
111

112
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
113
        return false;
114
    }
115

116
    dest->prealloc_align =
117
        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
118
    dest->prealloc_size =
119
        qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
120

121
    qemu_opts_del(opts);
122

123
    if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
124
        error_setg(errp, "prealloc-align parameter of preallocate filter "
125
                   "is not aligned to %llu", BDRV_SECTOR_SIZE);
126
        return false;
127
    }
128

129
    if (!QEMU_IS_ALIGNED(dest->prealloc_align,
130
                         child_bs->bl.request_alignment)) {
131
        error_setg(errp, "prealloc-align parameter of preallocate filter "
132
                   "is not aligned to underlying node request alignment "
133
                   "(%" PRIi32 ")", child_bs->bl.request_alignment);
134
        return false;
135
    }
136

137
    return true;
138
}
139

140
static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
141
                            Error **errp)
142
{
143
    BDRVPreallocateState *s = bs->opaque;
144
    int ret;
145

146
    GLOBAL_STATE_CODE();
147

148
    /*
149
     * s->data_end and friends should be initialized on permission update.
150
     * For this to work, mark them invalid.
151
     */
152
    s->file_end = s->zero_start = s->data_end = -EINVAL;
153
    s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
154

155
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
156
    if (ret < 0) {
157
        return ret;
158
    }
159

160
    GRAPH_RDLOCK_GUARD_MAINLOOP();
161

162
    if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
163
        return -EINVAL;
164
    }
165

166
    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
167
        (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
168

169
    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
170
        ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
171
            bs->file->bs->supported_zero_flags);
172

173
    return 0;
174
}
175

176
static int GRAPH_RDLOCK
177
preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
178
{
179
    BDRVPreallocateState *s = bs->opaque;
180
    int ret;
181

182
    if (s->file_end < 0) {
183
        s->file_end = bdrv_getlength(bs->file->bs);
184
        if (s->file_end < 0) {
185
            error_setg_errno(errp, -s->file_end, "Failed to get file length");
186
            return s->file_end;
187
        }
188
    }
189

190
    if (s->data_end < s->file_end) {
191
        ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
192
                            NULL);
193
        if (ret < 0) {
194
            error_setg_errno(errp, -ret, "Failed to drop preallocation");
195
            s->file_end = ret;
196
            return ret;
197
        }
198
        s->file_end = s->data_end;
199
    }
200

201
    return 0;
202
}
203

204
/*
 * .bdrv_close handler: release the drop-resize BH and, if the data end is
 * known, crop the preallocation. Truncation errors are ignored here.
 */
static void preallocate_close(BlockDriverState *bs)
{
    BDRVPreallocateState *s = bs->opaque;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    qemu_bh_cancel(s->drop_resize_bh);
    qemu_bh_delete(s->drop_resize_bh);

    if (s->data_end >= 0) {
        preallocate_truncate_to_real_size(bs, NULL);
    }
}
218

219

220
/*
 * Handle reopen.
 *
 * We must implement reopen handlers, otherwise reopen simply doesn't work.
 * Handle new options and don't care about preallocation state, as it is
 * handled in the set/check permission handlers.
 */
227

228
static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
229
                                      BlockReopenQueue *queue, Error **errp)
230
{
231
    PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
232
    int ret;
233

234
    GLOBAL_STATE_CODE();
235
    GRAPH_RDLOCK_GUARD_MAINLOOP();
236

237
    if (!preallocate_absorb_opts(opts, reopen_state->options,
238
                                 reopen_state->bs->file->bs, errp)) {
239
        g_free(opts);
240
        return -EINVAL;
241
    }
242

243
    /*
244
     * Drop the preallocation already here if reopening read-only. The child
245
     * might also be reopened read-only and then scheduling a BH during the
246
     * permission update is too late.
247
     */
248
    if ((reopen_state->flags & BDRV_O_RDWR) == 0) {
249
        ret = preallocate_drop_resize(reopen_state->bs, errp);
250
        if (ret < 0) {
251
            g_free(opts);
252
            return ret;
253
        }
254
    }
255

256
    reopen_state->opaque = opts;
257

258
    return 0;
259
}
260

261
/*
 * .bdrv_reopen_commit handler: install the options prepared by
 * preallocate_reopen_prepare() and free the temporary copy.
 */
static void preallocate_reopen_commit(BDRVReopenState *state)
{
    BDRVPreallocateState *s = state->bs->opaque;

    s->opts = *(PreallocateOpts *)state->opaque;

    g_free(state->opaque);
    state->opaque = NULL;
}
270

271
/*
 * .bdrv_reopen_abort handler: discard the options prepared by
 * preallocate_reopen_prepare().
 */
static void preallocate_reopen_abort(BDRVReopenState *state)
{
    g_free(state->opaque);
    state->opaque = NULL;
}
276

277
static int coroutine_fn GRAPH_RDLOCK
278
preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
279
                           QEMUIOVector *qiov, size_t qiov_offset,
280
                           BdrvRequestFlags flags)
281
{
282
    return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
283
                               flags);
284
}
285

286
static int coroutine_fn GRAPH_RDLOCK
287
preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
288
{
289
    return bdrv_co_pdiscard(bs->file, offset, bytes);
290
}
291

292
static bool can_write_resize(uint64_t perm)
293
{
294
    return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
295
}
296

297
/*
 * Returns true if we hold write and resize permissions on the file child.
 *
 * Also asserts the invariants tied to that: when we hold them, they are not
 * shared with other users; when we don't, all tracked state is invalid.
 */
static bool GRAPH_RDLOCK has_prealloc_perms(BlockDriverState *bs)
{
    BDRVPreallocateState *s = bs->opaque;

    if (can_write_resize(bs->file->perm)) {
        assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
        assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
        return true;
    }

    assert(s->data_end < 0);
    assert(s->zero_start < 0);
    assert(s->file_end < 0);
    return false;
}
312

313
/*
314
 * Call on each write. Returns true if @want_merge_zero is true and the region
315
 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
316
 * preallocation).
317
 *
318
 * want_merge_zero is used to merge write-zero request with preallocation in
319
 * one bdrv_co_pwrite_zeroes() call.
320
 */
321
static bool coroutine_fn GRAPH_RDLOCK
322
handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
323
             bool want_merge_zero)
324
{
325
    BDRVPreallocateState *s = bs->opaque;
326
    int64_t end = offset + bytes;
327
    int64_t prealloc_start, prealloc_end;
328
    int ret;
329
    uint32_t file_align = bs->file->bs->bl.request_alignment;
330
    uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
331

332
    assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
333

334
    if (!has_prealloc_perms(bs)) {
335
        /* We don't have state neither should try to recover it */
336
        return false;
337
    }
338

339
    if (s->data_end < 0) {
340
        s->data_end = bdrv_co_getlength(bs->file->bs);
341
        if (s->data_end < 0) {
342
            return false;
343
        }
344

345
        if (s->file_end < 0) {
346
            s->file_end = s->data_end;
347
        }
348
    }
349

350
    if (end <= s->data_end) {
351
        return false;
352
    }
353

354
    /* We have valid s->data_end, and request writes beyond it. */
355

356
    s->data_end = end;
357
    if (s->zero_start < 0 || !want_merge_zero) {
358
        s->zero_start = end;
359
    }
360

361
    if (s->file_end < 0) {
362
        s->file_end = bdrv_co_getlength(bs->file->bs);
363
        if (s->file_end < 0) {
364
            return false;
365
        }
366
    }
367

368
    /* Now s->data_end, s->zero_start and s->file_end are valid. */
369

370
    if (end <= s->file_end) {
371
        /* No preallocation needed. */
372
        return want_merge_zero && offset >= s->zero_start;
373
    }
374

375
    /* Now we want new preallocation, as request writes beyond s->file_end. */
376

377
    prealloc_start = QEMU_ALIGN_UP(
378
            want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
379
            file_align);
380
    prealloc_end = QEMU_ALIGN_UP(
381
            MAX(prealloc_start, end) + s->opts.prealloc_size,
382
            prealloc_align);
383

384
    want_merge_zero = want_merge_zero && (prealloc_start <= offset);
385

386
    ret = bdrv_co_pwrite_zeroes(
387
            bs->file, prealloc_start, prealloc_end - prealloc_start,
388
            BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
389
    if (ret < 0) {
390
        s->file_end = ret;
391
        return false;
392
    }
393

394
    s->file_end = prealloc_end;
395
    return want_merge_zero;
396
}
397

398
static int coroutine_fn GRAPH_RDLOCK
399
preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
400
                             int64_t bytes, BdrvRequestFlags flags)
401
{
402
    bool want_merge_zero =
403
        !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
404
    if (handle_write(bs, offset, bytes, want_merge_zero)) {
405
        return 0;
406
    }
407

408
    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
409
}
410

411
static int coroutine_fn GRAPH_RDLOCK
412
preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
413
                            QEMUIOVector *qiov, size_t qiov_offset,
414
                            BdrvRequestFlags flags)
415
{
416
    handle_write(bs, offset, bytes, false);
417

418
    return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
419
                                flags);
420
}
421

422
static int coroutine_fn GRAPH_RDLOCK
423
preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
424
                        bool exact, PreallocMode prealloc,
425
                        BdrvRequestFlags flags, Error **errp)
426
{
427
    ERRP_GUARD();
428
    BDRVPreallocateState *s = bs->opaque;
429
    int ret;
430

431
    if (s->data_end >= 0 && offset > s->data_end) {
432
        if (s->file_end < 0) {
433
            s->file_end = bdrv_co_getlength(bs->file->bs);
434
            if (s->file_end < 0) {
435
                error_setg(errp, "failed to get file length");
436
                return s->file_end;
437
            }
438
        }
439

440
        if (prealloc == PREALLOC_MODE_FALLOC) {
441
            /*
442
             * If offset <= s->file_end, the task is already done, just
443
             * update s->data_end, to move part of "filter preallocation"
444
             * to "preallocation requested by user".
445
             * Otherwise just proceed to preallocate missing part.
446
             */
447
            if (offset <= s->file_end) {
448
                s->data_end = offset;
449
                return 0;
450
            }
451
        } else {
452
            /*
453
             * We have to drop our preallocation, to
454
             * - avoid "Cannot use preallocation for shrinking files" in
455
             *   case of offset < file_end
456
             * - give PREALLOC_MODE_OFF a chance to keep small disk
457
             *   usage
458
             * - give PREALLOC_MODE_FULL a chance to actually write the
459
             *   whole region as user expects
460
             */
461
            if (s->file_end > s->data_end) {
462
                ret = bdrv_co_truncate(bs->file, s->data_end, true,
463
                                       PREALLOC_MODE_OFF, 0, errp);
464
                if (ret < 0) {
465
                    s->file_end = ret;
466
                    error_prepend(errp, "preallocate-filter: failed to drop "
467
                                  "write-zero preallocation: ");
468
                    return ret;
469
                }
470
                s->file_end = s->data_end;
471
            }
472
        }
473

474
        s->data_end = offset;
475
    }
476

477
    ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
478
    if (ret < 0) {
479
        s->file_end = s->zero_start = s->data_end = ret;
480
        return ret;
481
    }
482

483
    if (has_prealloc_perms(bs)) {
484
        s->file_end = s->zero_start = s->data_end = offset;
485
    }
486
    return 0;
487
}
488

489
/* Flushes are passed through to the node of the file child. */
static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
{
    return bdrv_co_flush(bs->file->bs);
}
493

494
static int64_t coroutine_fn GRAPH_RDLOCK
495
preallocate_co_getlength(BlockDriverState *bs)
496
{
497
    int64_t ret;
498
    BDRVPreallocateState *s = bs->opaque;
499

500
    if (s->data_end >= 0) {
501
        return s->data_end;
502
    }
503

504
    ret = bdrv_co_getlength(bs->file->bs);
505

506
    if (has_prealloc_perms(bs)) {
507
        s->file_end = s->zero_start = s->data_end = ret;
508
    }
509

510
    return ret;
511
}
512

513
static int GRAPH_RDLOCK
514
preallocate_drop_resize(BlockDriverState *bs, Error **errp)
515
{
516
    BDRVPreallocateState *s = bs->opaque;
517
    int ret;
518

519
    if (s->data_end < 0) {
520
        return 0;
521
    }
522

523
    /*
524
     * Before switching children to be read-only, truncate them to remove
525
     * the preallocation and let them have the real size.
526
     */
527
    ret = preallocate_truncate_to_real_size(bs, errp);
528
    if (ret < 0) {
529
        return ret;
530
    }
531

532
    /*
533
     * We'll drop our permissions and will allow other users to take write and
534
     * resize permissions (see preallocate_child_perm). Anyone will be able to
535
     * change the child, so mark all states invalid. We'll regain control if a
536
     * parent requests write access again.
537
     */
538
    s->data_end = s->file_end = s->zero_start = -EINVAL;
539

540
    bdrv_child_refresh_perms(bs, bs->file, NULL);
541

542
    return 0;
543
}
544

545
static void preallocate_drop_resize_bh(void *opaque)
546
{
547
    GLOBAL_STATE_CODE();
548
    GRAPH_RDLOCK_GUARD_MAINLOOP();
549

550
    /*
551
     * In case of errors, we'll simply keep the exclusive lock on the image
552
     * indefinitely.
553
     */
554
    preallocate_drop_resize(opaque, NULL);
555
}
556

557
static void GRAPH_RDLOCK
558
preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
559
{
560
    BDRVPreallocateState *s = bs->opaque;
561

562
    if (can_write_resize(perm)) {
563
        qemu_bh_cancel(s->drop_resize_bh);
564
        if (s->data_end < 0) {
565
            s->data_end = s->file_end = s->zero_start =
566
                bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
567
        }
568
    } else {
569
        qemu_bh_schedule(s->drop_resize_bh);
570
    }
571
}
572

573
static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
574
    BdrvChildRole role, BlockReopenQueue *reopen_queue,
575
    uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
576
{
577
    BDRVPreallocateState *s = bs->opaque;
578

579
    bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
580

581
    /*
582
     * We need exclusive write and resize permissions on the child not only when
583
     * the parent can write to it, but also after the parent gave up write
584
     * permissions until preallocate_drop_resize() has completed.
585
     */
586
    if (can_write_resize(perm) || s->data_end != -EINVAL) {
587
        *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
588

589
        /*
590
         * Don't share, to keep our states s->file_end, s->data_end and
591
         * s->zero_start valid.
592
         */
593
        *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
594
    }
595
}
596

597
static BlockDriver bdrv_preallocate_filter = {
598
    .format_name = "preallocate",
599
    .instance_size = sizeof(BDRVPreallocateState),
600

601
    .bdrv_co_getlength    = preallocate_co_getlength,
602
    .bdrv_open            = preallocate_open,
603
    .bdrv_close           = preallocate_close,
604

605
    .bdrv_reopen_prepare  = preallocate_reopen_prepare,
606
    .bdrv_reopen_commit   = preallocate_reopen_commit,
607
    .bdrv_reopen_abort    = preallocate_reopen_abort,
608

609
    .bdrv_co_preadv_part = preallocate_co_preadv_part,
610
    .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
611
    .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
612
    .bdrv_co_pdiscard = preallocate_co_pdiscard,
613
    .bdrv_co_flush = preallocate_co_flush,
614
    .bdrv_co_truncate = preallocate_co_truncate,
615

616
    .bdrv_set_perm = preallocate_set_perm,
617
    .bdrv_child_perm = preallocate_child_perm,
618

619
    .is_filter = true,
620
};
621

622
static void bdrv_preallocate_init(void)
623
{
624
    bdrv_register(&bdrv_preallocate_filter);
625
}
626

627
block_init(bdrv_preallocate_init);
628

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.