qemu

file-posix.c
4565 строк · 130.9 Кб
Перенос по словам
1
/*
2
 * Block driver for RAW files (posix)
3
 *
4
 * Copyright (c) 2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24

25
#include "qemu/osdep.h"
26
#include "qapi/error.h"
27
#include "qemu/cutils.h"
28
#include "qemu/error-report.h"
29
#include "block/block-io.h"
30
#include "block/block_int.h"
31
#include "qemu/module.h"
32
#include "qemu/option.h"
33
#include "qemu/units.h"
34
#include "qemu/memalign.h"
35
#include "trace.h"
36
#include "block/thread-pool.h"
37
#include "qemu/iov.h"
38
#include "block/raw-aio.h"
39
#include "qapi/qmp/qdict.h"
40
#include "qapi/qmp/qstring.h"
41

42
#include "scsi/pr-manager.h"
43
#include "scsi/constants.h"
44

45
#if defined(__APPLE__) && (__MACH__)
46
#include <sys/ioctl.h>
47
#if defined(HAVE_HOST_BLOCK_DEVICE)
48
#include <paths.h>
49
#include <sys/param.h>
50
#include <sys/mount.h>
51
#include <IOKit/IOKitLib.h>
52
#include <IOKit/IOBSD.h>
53
#include <IOKit/storage/IOMediaBSDClient.h>
54
#include <IOKit/storage/IOMedia.h>
55
#include <IOKit/storage/IOCDMedia.h>
56
//#include <IOKit/storage/IOCDTypes.h>
57
#include <IOKit/storage/IODVDMedia.h>
58
#include <CoreFoundation/CoreFoundation.h>
59
#endif /* defined(HAVE_HOST_BLOCK_DEVICE) */
60
#endif
61

62
#ifdef __sun__
63
#define _POSIX_PTHREAD_SEMANTICS 1
64
#include <sys/dkio.h>
65
#endif
66
#ifdef __linux__
67
#include <sys/ioctl.h>
68
#include <sys/param.h>
69
#include <sys/syscall.h>
70
#include <sys/vfs.h>
71
#if defined(CONFIG_BLKZONED)
72
#include <linux/blkzoned.h>
73
#endif
74
#include <linux/cdrom.h>
75
#include <linux/fd.h>
76
#include <linux/fs.h>
77
#include <linux/hdreg.h>
78
#include <linux/magic.h>
79
#include <scsi/sg.h>
80
#ifdef __s390__
81
#include <asm/dasd.h>
82
#endif
83
#ifndef FS_NOCOW_FL
84
#define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
85
#endif
86
#endif
87
#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
88
#include <linux/falloc.h>
89
#endif
90
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
91
#include <sys/disk.h>
92
#include <sys/cdio.h>
93
#endif
94

95
#ifdef __OpenBSD__
96
#include <sys/ioctl.h>
97
#include <sys/disklabel.h>
98
#include <sys/dkio.h>
99
#endif
100

101
#ifdef __NetBSD__
102
#include <sys/ioctl.h>
103
#include <sys/disklabel.h>
104
#include <sys/dkio.h>
105
#include <sys/disk.h>
106
#endif
107

108
#ifdef __DragonFly__
109
#include <sys/ioctl.h>
110
#include <sys/diskslice.h>
111
#endif
112

113
/* OS X does not have O_DSYNC */
114
#ifndef O_DSYNC
115
#ifdef O_SYNC
116
#define O_DSYNC O_SYNC
117
#elif defined(O_FSYNC)
118
#define O_DSYNC O_FSYNC
119
#endif
120
#endif
121

122
/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
123
#ifndef O_DIRECT
124
#define O_DIRECT O_DSYNC
125
#endif
126

127
#define FTYPE_FILE   0
128
#define FTYPE_CD     1
129

130
#define MAX_BLOCKSIZE	4096
131

132
/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
133
 * leaving a few more bytes for its future use. */
134
#define RAW_LOCK_PERM_BASE             100
135
#define RAW_LOCK_SHARED_BASE           200
136

137
typedef struct BDRVRawState {
138
    int fd;
139
    bool use_lock;
140
    int type;
141
    int open_flags;
142
    size_t buf_align;
143

144
    /* The current permissions. */
145
    uint64_t perm;
146
    uint64_t shared_perm;
147

148
    /* The perms bits whose corresponding bytes are already locked in
149
     * s->fd. */
150
    uint64_t locked_perm;
151
    uint64_t locked_shared_perm;
152

153
    uint64_t aio_max_batch;
154

155
    int perm_change_fd;
156
    int perm_change_flags;
157
    BDRVReopenState *reopen_state;
158

159
    bool has_discard:1;
160
    bool has_write_zeroes:1;
161
    bool use_linux_aio:1;
162
    bool has_laio_fdsync:1;
163
    bool use_linux_io_uring:1;
164
    int page_cache_inconsistent; /* errno from fdatasync failure */
165
    bool has_fallocate;
166
    bool needs_alignment;
167
    bool force_alignment;
168
    bool drop_cache;
169
    bool check_cache_dropped;
170
    struct {
171
        uint64_t discard_nb_ok;
172
        uint64_t discard_nb_failed;
173
        uint64_t discard_bytes_ok;
174
    } stats;
175

176
    PRManager *pr_mgr;
177
} BDRVRawState;
178

179
/*
 * Values staged for a reopen: the new open(2) flags and the new settings
 * of the two cache-dropping options, mirroring the fields of the same
 * names in BDRVRawState.
 */
typedef struct BDRVRawReopenState {
    int open_flags;             /* open(2) flags for the reopened file */
    bool drop_cache;            /* staged value of "drop-cache" */
    bool check_cache_dropped;   /* staged value of "x-check-cache-dropped" */
} BDRVRawReopenState;
184

185
/*
 * Sanity check used by the I/O paths: make sure the driver holds a usable
 * host file descriptor.
 *
 * Returns 0 if s->fd is non-negative, -EIO otherwise.
 */
static int fd_open(BlockDriverState *bs)
{
    const BDRVRawState *state = bs->opaque;

    return state->fd < 0 ? -EIO : 0;
}
195

196
static int64_t raw_getlength(BlockDriverState *bs);
197

198
typedef struct RawPosixAIOData {
199
    BlockDriverState *bs;
200
    int aio_type;
201
    int aio_fildes;
202

203
    off_t aio_offset;
204
    uint64_t aio_nbytes;
205

206
    union {
207
        struct {
208
            struct iovec *iov;
209
            int niov;
210
        } io;
211
        struct {
212
            uint64_t cmd;
213
            void *buf;
214
        } ioctl;
215
        struct {
216
            int aio_fd2;
217
            off_t aio_offset2;
218
        } copy_range;
219
        struct {
220
            PreallocMode prealloc;
221
            Error **errp;
222
        } truncate;
223
        struct {
224
            unsigned int *nr_zones;
225
            BlockZoneDescriptor *zones;
226
        } zone_report;
227
        struct {
228
            unsigned long op;
229
        } zone_mgmt;
230
    };
231
} RawPosixAIOData;
232

233
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
234
static int cdrom_reopen(BlockDriverState *bs);
235
#endif
236

237
/*
 * Report a failure to lock a file region.
 *
 * EAGAIN and EACCES mean that the region is already locked by another
 * process, which is a common scenario, so for those two codes the errno
 * description is left out of the message; any other error code is
 * reported together with its errno text.
 *
 * Note that @err is evaluated more than once.
 */
#define raw_lock_error_setg_errno(errp, err, fmt, ...)                  \
    do {                                                                \
        if ((err) != EAGAIN && (err) != EACCES) {                       \
            error_setg_errno((errp), (err), (fmt), ## __VA_ARGS__);     \
        } else {                                                        \
            error_setg((errp), (fmt), ## __VA_ARGS__);                  \
        }                                                               \
    } while (0)
250

251
#if defined(__NetBSD__)
252
static int raw_normalize_devicepath(const char **filename, Error **errp)
253
{
254
    static char namebuf[PATH_MAX];
255
    const char *dp, *fname;
256
    struct stat sb;
257

258
    fname = *filename;
259
    dp = strrchr(fname, '/');
260
    if (lstat(fname, &sb) < 0) {
261
        error_setg_file_open(errp, errno, fname);
262
        return -errno;
263
    }
264

265
    if (!S_ISBLK(sb.st_mode)) {
266
        return 0;
267
    }
268

269
    if (dp == NULL) {
270
        snprintf(namebuf, PATH_MAX, "r%s", fname);
271
    } else {
272
        snprintf(namebuf, PATH_MAX, "%.*s/r%s",
273
            (int)(dp - fname), fname, dp + 1);
274
    }
275
    *filename = namebuf;
276
    warn_report("%s is a block device, using %s", fname, *filename);
277

278
    return 0;
279
}
280
#else
281
static int raw_normalize_devicepath(const char **filename, Error **errp)
282
{
283
    return 0;
284
}
285
#endif
286

287
/*
 * Get logical block size via ioctl. On success store it in @sector_size_p
 * and return 0; otherwise return -errno.
 *
 * Every ioctl known on the build host is issued, in list order; if more
 * than one succeeds, the value from the last successful one is kept.
 * errno is preset to ENOTSUP so that a build with no candidate ioctl at
 * all reports -ENOTSUP.
 */
static int probe_logical_blocksize(int fd, unsigned int *sector_size_p)
{
    static const unsigned long candidates[] = {
#ifdef BLKSSZGET
        BLKSSZGET,
#endif
#ifdef DKIOCGETBLOCKSIZE
        DKIOCGETBLOCKSIZE,
#endif
#ifdef DIOCGSECTORSIZE
        DIOCGSECTORSIZE,
#endif
    };
    unsigned int probed;
    bool found = false;
    int idx;

    errno = ENOTSUP;

    /* Try a few ioctls to get the right size */
    for (idx = 0; idx < (int)ARRAY_SIZE(candidates); idx++) {
        if (ioctl(fd, candidates[idx], &probed) < 0) {
            continue;
        }
        *sector_size_p = probed;
        found = true;
    }

    if (!found) {
        return -errno;
    }
    return 0;
}
319

320
/**
 * Get physical block size of @fd.
 *
 * The size is queried with the BLKPBSZGET ioctl when the build host
 * defines it.
 *
 * On success, store it in @blk_size and return 0.
 * On failure, return -errno; hosts without BLKPBSZGET always get -ENOTSUP.
 */
static int probe_physical_blocksize(int fd, unsigned int *blk_size)
{
#ifdef BLKPBSZGET
    return ioctl(fd, BLKPBSZGET, blk_size) < 0 ? -errno : 0;
#else
    return -ENOTSUP;
#endif
}
336

337
/*
 * Returns true if no alignment restrictions are necessary even for files
 * opened with O_DIRECT.
 *
 * raw_probe_alignment() probes the required alignment and assumes that 1
 * means the probing failed, so it falls back to a safe default of 4k. This
 * can be avoided if we know that byte alignment is okay for the file.
 *
 * Currently that is only reported on Linux for files on NFS; a failing
 * fstatfs() and every non-Linux host yield false.
 */
static bool dio_byte_aligned(int fd)
{
#ifdef __linux__
    struct statfs fs_info;

    if (fstatfs(fd, &fs_info) != 0) {
        return false;
    }
    return fs_info.f_type == NFS_SUPER_MAGIC;
#else
    return false;
#endif
}
358

359
/*
 * Decide whether requests on @bs must honour alignment restrictions.
 *
 * This is the case when the node was opened with BDRV_O_NOCACHE and the
 * file is not known to accept byte-aligned direct I/O, or when
 * s->force_alignment has been set.
 */
static bool raw_needs_alignment(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    const bool direct_io = (bs->open_flags & BDRV_O_NOCACHE) != 0;

    return (direct_io && !dio_byte_aligned(s->fd)) || s->force_alignment;
}
369

370
/*
 * Check if read is allowed with given memory buffer and length.
 *
 * This function is used to check O_DIRECT memory buffer and request
 * alignment: it issues a pread() of @len bytes at offset 0 into @buf.
 *
 * Returns true if the read succeeded.  If it failed, Linux hosts return
 * false only for EINVAL and true for every other error; other hosts
 * return false for any failure.
 */
static bool raw_is_io_aligned(int fd, void *buf, size_t len)
{
    if (pread(fd, buf, len, 0) >= 0) {
        return true;
    }

#ifdef __linux__
    /*
     * The Linux kernel returns EINVAL for misaligned O_DIRECT reads.  Ignore
     * other errors (e.g. real I/O error), which could happen on a failed
     * drive, since we only care about probing alignment.
     */
    return errno != EINVAL;
#else
    return false;
#endif
}
394

395
/*
 * Determine the request alignment (bs->bl.request_alignment) and the
 * memory buffer alignment (s->buf_align) to use for @fd.
 *
 * SCSI generic nodes and nodes that do not need alignment get 1 for both.
 * Otherwise the request alignment is taken from the logical block size,
 * overridden on Linux by the XFS DIOINFO ioctl when that succeeds, and as
 * a last resort guessed by trial reads.  The buffer alignment is always
 * guessed by trial reads.
 *
 * If either value could not be determined, an error is stored in @errp.
 */
static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
{
    static const size_t candidates[] = {1, 512, 1024, 2048, 4096};
    BDRVRawState *s = bs->opaque;
    const size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
    char *probe_buf;
    size_t idx;

    /* For SCSI generic devices the alignment is not really used.
       With buffered I/O, we don't have any restrictions. */
    if (bdrv_is_sg(bs) || !s->needs_alignment) {
        s->buf_align = 1;
        bs->bl.request_alignment = 1;
        return;
    }

    /* Zero means "not known yet" for both values. */
    s->buf_align = 0;
    bs->bl.request_alignment = 0;

    /* Let's try to use the logical blocksize for the alignment. */
    if (probe_logical_blocksize(fd, &bs->bl.request_alignment) < 0) {
        bs->bl.request_alignment = 0;
    }

#ifdef __linux__
    {
        /*
         * The XFS ioctl definitions are shipped in extra packages that
         * might not always be available. Since we just need the
         * XFS_IOC_DIOINFO ioctl here, we simply use our own definition
         * instead:
         */
        struct xfs_dioattr {
            uint32_t d_mem;
            uint32_t d_miniosz;
            uint32_t d_maxiosz;
        } dio;

        if (ioctl(fd, _IOR('X', 30, struct xfs_dioattr), &dio) >= 0) {
            /*
             * Only d_miniosz is used: the kernel returns wrong
             * information for d_mem, so buf_align stays unknown here.
             */
            bs->bl.request_alignment = dio.d_miniosz;
        }
    }
#endif

    /*
     * If we could not get the sizes so far, we can only guess them. First try
     * to detect request alignment, since it is more likely to succeed. Then
     * try to detect buf_align, which cannot be detected in some cases (e.g.
     * Gluster). If buf_align cannot be detected, we fallback to the value of
     * request_alignment.
     */

    if (bs->bl.request_alignment == 0) {
        /* Fully aligned buffer, growing read length. */
        probe_buf = qemu_memalign(max_align, max_align);
        for (idx = 0; idx < ARRAY_SIZE(candidates); idx++) {
            const size_t len = candidates[idx];

            if (!raw_is_io_aligned(fd, probe_buf, len)) {
                continue;
            }
            /* A 1-byte read that works tells us nothing: use a safe value. */
            bs->bl.request_alignment = (len == 1) ? max_align : len;
            break;
        }
        qemu_vfree(probe_buf);
    }

    if (s->buf_align == 0) {
        /* Fixed read length, buffer start shifted by each candidate. */
        probe_buf = qemu_memalign(max_align, 2 * max_align);
        for (idx = 0; idx < ARRAY_SIZE(candidates); idx++) {
            const size_t shift = candidates[idx];

            if (!raw_is_io_aligned(fd, probe_buf + shift, max_align)) {
                continue;
            }
            /* A 1-byte shift that works: fall back to request_alignment. */
            s->buf_align = (shift == 1) ? bs->bl.request_alignment : shift;
            break;
        }
        qemu_vfree(probe_buf);
    }

    if (s->buf_align == 0 || bs->bl.request_alignment == 0) {
        error_setg(errp, "Could not find working O_DIRECT alignment");
        error_append_hint(errp, "Try cache.direct=off\n");
    }
}
478

479
/*
 * Check whether the device behind @fd can be written to.
 *
 * Returns 0 if @fd is not a block device, if the block device is not
 * flagged read-only, or if the host has no BLKROGET ioctl.  Returns
 * -EACCES for a read-only block device and -errno if fstat() or the
 * ioctl fails.
 */
static int check_hdev_writable(int fd)
{
#if defined(BLKROGET)
    /* Linux block devices can be configured "read-only" using blockdev(8).
     * This is independent of device node permissions and therefore open(2)
     * with O_RDWR succeeds.  Actual writes fail with EPERM.
     *
     * bdrv_open() is supposed to fail if the disk is read-only.  Explicitly
     * check for read-only block devices so that Linux block devices behave
     * properly.
     */
    struct stat info;
    int ro_flag = 0;

    if (fstat(fd, &info) != 0) {
        return -errno;
    }

    if (S_ISBLK(info.st_mode)) {
        if (ioctl(fd, BLKROGET, &ro_flag) < 0) {
            return -errno;
        }
        if (ro_flag != 0) {
            return -EACCES;
        }
    }
#endif /* defined(BLKROGET) */
    return 0;
}
511

512
static void raw_parse_flags(int bdrv_flags, int *open_flags, bool has_writers)
513
{
514
    bool read_write = false;
515
    assert(open_flags != NULL);
516

517
    *open_flags |= O_BINARY;
518
    *open_flags &= ~O_ACCMODE;
519

520
    if (bdrv_flags & BDRV_O_AUTO_RDONLY) {
521
        read_write = has_writers;
522
    } else if (bdrv_flags & BDRV_O_RDWR) {
523
        read_write = true;
524
    }
525

526
    if (read_write) {
527
        *open_flags |= O_RDWR;
528
    } else {
529
        *open_flags |= O_RDONLY;
530
    }
531

532
    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
533
     * and O_DIRECT for no caching. */
534
    if ((bdrv_flags & BDRV_O_NOCACHE)) {
535
        *open_flags |= O_DIRECT;
536
    }
537
}
538

539
static void raw_parse_filename(const char *filename, QDict *options,
540
                               Error **errp)
541
{
542
    bdrv_parse_filename_strip_prefix(filename, "file:", options);
543
}
544

545
static QemuOptsList raw_runtime_opts = {
546
    .name = "raw",
547
    .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
548
    .desc = {
549
        {
550
            .name = "filename",
551
            .type = QEMU_OPT_STRING,
552
            .help = "File name of the image",
553
        },
554
        {
555
            .name = "aio",
556
            .type = QEMU_OPT_STRING,
557
            .help = "host AIO implementation (threads, native, io_uring)",
558
        },
559
        {
560
            .name = "aio-max-batch",
561
            .type = QEMU_OPT_NUMBER,
562
            .help = "AIO max batch size (0 = auto handled by AIO backend, default: 0)",
563
        },
564
        {
565
            .name = "locking",
566
            .type = QEMU_OPT_STRING,
567
            .help = "file locking mode (on/off/auto, default: auto)",
568
        },
569
        {
570
            .name = "pr-manager",
571
            .type = QEMU_OPT_STRING,
572
            .help = "id of persistent reservation manager object (default: none)",
573
        },
574
#if defined(__linux__)
575
        {
576
            .name = "drop-cache",
577
            .type = QEMU_OPT_BOOL,
578
            .help = "invalidate page cache during live migration (default: on)",
579
        },
580
#endif
581
        {
582
            .name = "x-check-cache-dropped",
583
            .type = QEMU_OPT_BOOL,
584
            .help = "check that page cache was dropped on live migration (default: off)"
585
        },
586
        { /* end of list */ }
587
    },
588
};
589

590
/* NULL-terminated list of option names; currently a single entry. */
static const char *const mutable_opts[] = {
    "x-check-cache-dropped",
    NULL
};
591

592
/*
 * Common open path of the POSIX raw drivers.
 *
 * @bs:         node being opened; bs->opaque is the BDRVRawState to fill
 * @options:    runtime options; the ones listed in raw_runtime_opts are
 *              absorbed from this dictionary
 * @bdrv_flags: BDRV_O_* flags of the node
 * @open_flags: extra open(2) flags, completed by raw_parse_flags()
 * @device:     true if the file must be a character or block device,
 *              false if it must be a regular file
 * @errp:       error destination
 *
 * Returns 0 on success with s->fd holding the opened descriptor.  On
 * failure returns a negative errno value, sets @errp, and leaves s->fd
 * at -1 with nothing left open.
 *
 * Note that with BDRV_O_TEMPORARY the file name is unlinked on both the
 * success and the failure path.
 */
static int raw_open_common(BlockDriverState *bs, QDict *options,
                           int bdrv_flags, int open_flags,
                           bool device, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    QemuOpts *opts;
    Error *local_err = NULL;
    const char *filename = NULL;
    const char *str;
    BlockdevAioOptions aio, aio_default;
    int fd, ret;
    struct stat st;
    OnOffAuto locking;

    /*
     * Mark the descriptor as "not open" before the first possible
     * "goto fail": the cleanup code closes s->fd whenever it is not -1,
     * so it must never see a value that this function did not open.
     */
    s->fd = -1;

    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail;
    }

    filename = qemu_opt_get(opts, "filename");

    ret = raw_normalize_devicepath(&filename, errp);
    if (ret != 0) {
        goto fail;
    }

    /* The block layer flags provide the default for the "aio" option. */
    if (bdrv_flags & BDRV_O_NATIVE_AIO) {
        aio_default = BLOCKDEV_AIO_OPTIONS_NATIVE;
#ifdef CONFIG_LINUX_IO_URING
    } else if (bdrv_flags & BDRV_O_IO_URING) {
        aio_default = BLOCKDEV_AIO_OPTIONS_IO_URING;
#endif
    } else {
        aio_default = BLOCKDEV_AIO_OPTIONS_THREADS;
    }

    aio = qapi_enum_parse(&BlockdevAioOptions_lookup,
                          qemu_opt_get(opts, "aio"),
                          aio_default, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }

    s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
#ifdef CONFIG_LINUX_IO_URING
    s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
#endif

    s->aio_max_batch = qemu_opt_get_number(opts, "aio-max-batch", 0);

    locking = qapi_enum_parse(&OnOffAuto_lookup,
                              qemu_opt_get(opts, "locking"),
                              ON_OFF_AUTO_AUTO, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail;
    }
    switch (locking) {
    case ON_OFF_AUTO_ON:
        s->use_lock = true;
        if (!qemu_has_ofd_lock()) {
            warn_report("File lock requested but OFD locking syscall is "
                        "unavailable, falling back to POSIX file locks");
            error_printf("Due to the implementation, locks can be lost "
                         "unexpectedly.\n");
        }
        break;
    case ON_OFF_AUTO_OFF:
        s->use_lock = false;
        break;
    case ON_OFF_AUTO_AUTO:
        /* Lock only when the more robust OFD locks are available. */
        s->use_lock = qemu_has_ofd_lock();
        break;
    default:
        abort();
    }

    str = qemu_opt_get(opts, "pr-manager");
    if (str) {
        s->pr_mgr = pr_manager_lookup(str, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            ret = -EINVAL;
            goto fail;
        }
    }

    s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
    s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
                                               false);

    s->open_flags = open_flags;
    raw_parse_flags(bdrv_flags, &s->open_flags, false);

    fd = qemu_open(filename, s->open_flags, errp);
    ret = fd < 0 ? -errno : 0;

    if (ret < 0) {
        /* A read-only file system is reported as a permission problem. */
        if (ret == -EROFS) {
            ret = -EACCES;
        }
        goto fail;
    }
    s->fd = fd;

    /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
    if (s->open_flags & O_RDWR) {
        ret = check_hdev_writable(s->fd);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "The device is not writable");
            goto fail;
        }
    }

    s->perm = 0;
    s->shared_perm = BLK_PERM_ALL;

#ifdef CONFIG_LINUX_AIO
     /* Currently Linux does AIO only for files opened with O_DIRECT */
    if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
        error_setg(errp, "aio=native was specified, but it requires "
                         "cache.direct=on, which was not specified.");
        ret = -EINVAL;
        goto fail;
    }
    if (s->use_linux_aio) {
        s->has_laio_fdsync = laio_has_fdsync(s->fd);
    }
#else
    if (s->use_linux_aio) {
        error_setg(errp, "aio=native was specified, but is not supported "
                         "in this build.");
        ret = -EINVAL;
        goto fail;
    }
#endif /* !defined(CONFIG_LINUX_AIO) */

#ifndef CONFIG_LINUX_IO_URING
    if (s->use_linux_io_uring) {
        error_setg(errp, "aio=io_uring was specified, but is not supported "
                         "in this build.");
        ret = -EINVAL;
        goto fail;
    }
#endif /* !defined(CONFIG_LINUX_IO_URING) */

    s->has_discard = true;
    s->has_write_zeroes = true;

    if (fstat(s->fd, &st) < 0) {
        ret = -errno;
        error_setg_errno(errp, errno, "Could not stat file");
        goto fail;
    }

    if (!device) {
        if (!S_ISREG(st.st_mode)) {
            error_setg(errp, "'%s' driver requires '%s' to be a regular file",
                       bs->drv->format_name, bs->filename);
            ret = -EINVAL;
            goto fail;
        } else {
            s->has_fallocate = true;
        }
    } else {
        if (!(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
            error_setg(errp, "'%s' driver requires '%s' to be either "
                       "a character or block device",
                       bs->drv->format_name, bs->filename);
            ret = -EINVAL;
            goto fail;
        }
    }
#ifdef CONFIG_BLKZONED
    /*
     * The kernel page cache does not reliably work for writes to SWR zones
     * of zoned block device because it can not guarantee the order of writes.
     */
    if ((bs->bl.zoned != BLK_Z_NONE) &&
        (!(s->open_flags & O_DIRECT))) {
        error_setg(errp, "The driver supports zoned devices, and it requires "
                         "cache.direct=on, which was not specified.");
        /*
         * No host kernel page cache.  Leave through the common cleanup so
         * that the descriptor and the option list are released.
         */
        ret = -EINVAL;
        goto fail;
    }
#endif

    if (S_ISBLK(st.st_mode)) {
#ifdef __linux__
        /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
         * not rely on the contents of discarded blocks unless using O_DIRECT.
         * Same for BLKZEROOUT.
         */
        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
            s->has_write_zeroes = false;
        }
#endif
    }
#ifdef __FreeBSD__
    if (S_ISCHR(st.st_mode)) {
        /*
         * The file is a char device (disk), which on FreeBSD isn't behind
         * a pager, so force all requests to be aligned. This is needed
         * so QEMU makes sure all IO operations on the device are aligned
         * to sector size, or else FreeBSD will reject them with EINVAL.
         */
        s->force_alignment = true;
    }
#endif
    s->needs_alignment = raw_needs_alignment(bs);

    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
    if (S_ISREG(st.st_mode)) {
        /* When extending regular files, we get zeros from the OS */
        bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
    }
    ret = 0;
fail:
    if (ret < 0 && s->fd != -1) {
        qemu_close(s->fd);
        /* Do not leave a stale descriptor number behind. */
        s->fd = -1;
    }
    if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
        unlink(filename);
    }
    qemu_opts_del(opts);
    return ret;
}
823

824
static int raw_open(BlockDriverState *bs, QDict *options, int flags,
825
                    Error **errp)
826
{
827
    BDRVRawState *s = bs->opaque;
828

829
    s->type = FTYPE_FILE;
830
    return raw_open_common(bs, options, flags, 0, false, errp);
831
}
832

833
/* Phase of a permission change, as handled by raw_handle_perm_lock(). */
typedef enum {
    RAW_PL_PREPARE,     /* take the locks needed for the new permissions */
    RAW_PL_COMMIT,      /* the change has been committed */
    RAW_PL_ABORT,       /* the change was abandoned; restore the old locks */
} RawPermLockOp;
838

839
/*
 * Iterate @i over the bit positions of all permission bits, i.e. over
 * every i for which (1ULL << i) does not exceed BLK_PERM_ALL.  @i must be
 * a modifiable integer lvalue; it is parenthesized in every use so that
 * any lvalue expression can be passed safely.
 */
#define PERM_FOREACH(i) \
    for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; (i)++)
841

842
/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
843
 * file; if @unlock == true, also unlock the unneeded bytes.
844
 * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
845
 */
846
static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
847
                                uint64_t perm_lock_bits,
848
                                uint64_t shared_perm_lock_bits,
849
                                bool unlock, Error **errp)
850
{
851
    int ret;
852
    int i;
853
    uint64_t locked_perm, locked_shared_perm;
854

855
    if (s) {
856
        locked_perm = s->locked_perm;
857
        locked_shared_perm = s->locked_shared_perm;
858
    } else {
859
        /*
860
         * We don't have the previous bits, just lock/unlock for each of the
861
         * requested bits.
862
         */
863
        if (unlock) {
864
            locked_perm = BLK_PERM_ALL;
865
            locked_shared_perm = BLK_PERM_ALL;
866
        } else {
867
            locked_perm = 0;
868
            locked_shared_perm = 0;
869
        }
870
    }
871

872
    PERM_FOREACH(i) {
873
        int off = RAW_LOCK_PERM_BASE + i;
874
        uint64_t bit = (1ULL << i);
875
        if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
876
            ret = qemu_lock_fd(fd, off, 1, false);
877
            if (ret) {
878
                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
879
                                          off);
880
                return ret;
881
            } else if (s) {
882
                s->locked_perm |= bit;
883
            }
884
        } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
885
            ret = qemu_unlock_fd(fd, off, 1);
886
            if (ret) {
887
                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
888
                return ret;
889
            } else if (s) {
890
                s->locked_perm &= ~bit;
891
            }
892
        }
893
    }
894
    PERM_FOREACH(i) {
895
        int off = RAW_LOCK_SHARED_BASE + i;
896
        uint64_t bit = (1ULL << i);
897
        if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
898
            ret = qemu_lock_fd(fd, off, 1, false);
899
            if (ret) {
900
                raw_lock_error_setg_errno(errp, -ret, "Failed to lock byte %d",
901
                                          off);
902
                return ret;
903
            } else if (s) {
904
                s->locked_shared_perm |= bit;
905
            }
906
        } else if (unlock && (locked_shared_perm & bit) &&
907
                   !(shared_perm_lock_bits & bit)) {
908
            ret = qemu_unlock_fd(fd, off, 1);
909
            if (ret) {
910
                error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
911
                return ret;
912
            } else if (s) {
913
                s->locked_shared_perm &= ~bit;
914
            }
915
        }
916
    }
917
    return 0;
918
}
919

920
/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
921
static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
922
                                Error **errp)
923
{
924
    int ret;
925
    int i;
926

927
    PERM_FOREACH(i) {
928
        int off = RAW_LOCK_SHARED_BASE + i;
929
        uint64_t p = 1ULL << i;
930
        if (perm & p) {
931
            ret = qemu_lock_fd_test(fd, off, 1, true);
932
            if (ret) {
933
                char *perm_name = bdrv_perm_names(p);
934

935
                raw_lock_error_setg_errno(errp, -ret,
936
                                          "Failed to get \"%s\" lock",
937
                                          perm_name);
938
                g_free(perm_name);
939
                return ret;
940
            }
941
        }
942
    }
943
    PERM_FOREACH(i) {
944
        int off = RAW_LOCK_PERM_BASE + i;
945
        uint64_t p = 1ULL << i;
946
        if (!(shared_perm & p)) {
947
            ret = qemu_lock_fd_test(fd, off, 1, true);
948
            if (ret) {
949
                char *perm_name = bdrv_perm_names(p);
950

951
                raw_lock_error_setg_errno(errp, -ret,
952
                                          "Failed to get shared \"%s\" lock",
953
                                          perm_name);
954
                g_free(perm_name);
955
                return ret;
956
            }
957
        }
958
    }
959
    return 0;
960
}
961

962
static int raw_handle_perm_lock(BlockDriverState *bs,
963
                                RawPermLockOp op,
964
                                uint64_t new_perm, uint64_t new_shared,
965
                                Error **errp)
966
{
967
    BDRVRawState *s = bs->opaque;
968
    int ret = 0;
969
    Error *local_err = NULL;
970

971
    if (!s->use_lock) {
972
        return 0;
973
    }
974

975
    if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
976
        return 0;
977
    }
978

979
    switch (op) {
980
    case RAW_PL_PREPARE:
981
        if ((s->perm | new_perm) == s->perm &&
982
            (s->shared_perm & new_shared) == s->shared_perm)
983
        {
984
            /*
985
             * We are going to unlock bytes, it should not fail. If it fail due
986
             * to some fs-dependent permission-unrelated reasons (which occurs
987
             * sometimes on NFS and leads to abort in bdrv_replace_child) we
988
             * can't prevent such errors by any check here. And we ignore them
989
             * anyway in ABORT and COMMIT.
990
             */
991
            return 0;
992
        }
993
        ret = raw_apply_lock_bytes(s, s->fd, s->perm | new_perm,
994
                                   ~s->shared_perm | ~new_shared,
995
                                   false, errp);
996
        if (!ret) {
997
            ret = raw_check_lock_bytes(s->fd, new_perm, new_shared, errp);
998
            if (!ret) {
999
                return 0;
1000
            }
1001
            error_append_hint(errp,
1002
                              "Is another process using the image [%s]?\n",
1003
                              bs->filename);
1004
        }
1005
        /* fall through to unlock bytes. */
1006
    case RAW_PL_ABORT:
1007
        raw_apply_lock_bytes(s, s->fd, s->perm, ~s->shared_perm,
1008
                             true, &local_err);
1009
        if (local_err) {
1010
            /* Theoretically the above call only unlocks bytes and it cannot
1011
             * fail. Something weird happened, report it.
1012
             */
1013
            warn_report_err(local_err);
1014
        }
1015
        break;
1016
    case RAW_PL_COMMIT:
1017
        raw_apply_lock_bytes(s, s->fd, new_perm, ~new_shared,
1018
                             true, &local_err);
1019
        if (local_err) {
1020
            /* Theoretically the above call only unlocks bytes and it cannot
1021
             * fail. Something weird happened, report it.
1022
             */
1023
            warn_report_err(local_err);
1024
        }
1025
        break;
1026
    }
1027
    return ret;
1028
}
1029

1030
/*
 * Add @flag to the file status flags of @fd.
 *
 * The current flags are read with F_GETFL and written back, OR-ed with
 * @flag, using F_SETFL. Returns 0 on success or -errno if either fcntl()
 * call fails.
 */
static int fcntl_setfl(int fd, int flag)
{
    int current = fcntl(fd, F_GETFL);

    if (current == -1 || fcntl(fd, F_SETFL, current | flag) == -1) {
        return -errno;
    }
    return 0;
}
1044

1045
/*
 * Return a file descriptor for @bs that matches the block-layer @flags and
 * the permissions in @perm, storing the resulting open(2) flags in
 * *@open_flags.
 *
 * Three strategies are tried in order:
 *  1. if the computed flags equal s->open_flags, s->fd itself is returned;
 *  2. if they differ only in flags that fcntl(F_SETFL) can change, s->fd is
 *     duplicated and the flags are adjusted on the duplicate;
 *  3. otherwise (or if step 2 failed) the file is opened again by name.
 *
 * A descriptor opened O_RDWR is additionally passed to check_hdev_writable().
 * Returns the descriptor, or -1 on failure.
 */
static int raw_reconfigure_getfd(BlockDriverState *bs, int flags,
                                 int *open_flags, uint64_t perm, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    int new_fd = -1;
    int ret;
    bool has_writers = perm &
        (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED | BLK_PERM_RESIZE);
    /* Flags that can be toggled on an already open descriptor. */
    int fcntl_flags = O_APPEND | O_NONBLOCK;
#ifdef O_NOATIME
    fcntl_flags |= O_NOATIME;
#endif

    *open_flags = (s->type == FTYPE_CD) ? O_NONBLOCK : 0;

    raw_parse_flags(flags, open_flags, has_writers);

#ifdef O_ASYNC
    /*
     * Not all operating systems have O_ASYNC, and those that don't
     * will not let us track the state into rs->open_flags (typically
     * you achieve the same effect with an ioctl, for example I_SETSIG
     * on Solaris). But we do not use O_ASYNC, so that's fine.
     */
    assert((s->open_flags & O_ASYNC) == 0);
#endif

    if (*open_flags == s->open_flags) {
        /* We're lucky, the existing fd is fine */
        return s->fd;
    }

    if ((*open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
        /* Only fcntl-changeable flags differ: dup the original fd */
        new_fd = qemu_dup(s->fd);
        if (new_fd >= 0 && fcntl_setfl(new_fd, *open_flags) != 0) {
            qemu_close(new_fd);
            new_fd = -1;
        }
    }

    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
    if (new_fd == -1) {
        const char *normalized_filename = bs->filename;

        ret = raw_normalize_devicepath(&normalized_filename, errp);
        if (ret < 0) {
            return -1;
        }
        new_fd = qemu_open(normalized_filename, *open_flags, errp);
        if (new_fd == -1) {
            return -1;
        }
    }

    if (new_fd != -1 && (*open_flags & O_RDWR)) {
        ret = check_hdev_writable(new_fd);
        if (ret < 0) {
            qemu_close(new_fd);
            error_setg_errno(errp, -ret, "The device is not writable");
            return -1;
        }
    }

    return new_fd;
}
1114

1115
static int raw_reopen_prepare(BDRVReopenState *state,
1116
                              BlockReopenQueue *queue, Error **errp)
1117
{
1118
    BDRVRawState *s;
1119
    BDRVRawReopenState *rs;
1120
    QemuOpts *opts;
1121
    int ret;
1122

1123
    assert(state != NULL);
1124
    assert(state->bs != NULL);
1125

1126
    s = state->bs->opaque;
1127

1128
    state->opaque = g_new0(BDRVRawReopenState, 1);
1129
    rs = state->opaque;
1130

1131
    /* Handle options changes */
1132
    opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
1133
    if (!qemu_opts_absorb_qdict(opts, state->options, errp)) {
1134
        ret = -EINVAL;
1135
        goto out;
1136
    }
1137

1138
    rs->drop_cache = qemu_opt_get_bool_del(opts, "drop-cache", true);
1139
    rs->check_cache_dropped =
1140
        qemu_opt_get_bool_del(opts, "x-check-cache-dropped", false);
1141

1142
    /* This driver's reopen function doesn't currently allow changing
1143
     * other options, so let's put them back in the original QDict and
1144
     * bdrv_reopen_prepare() will detect changes and complain. */
1145
    qemu_opts_to_qdict(opts, state->options);
1146

1147
    /*
1148
     * As part of reopen prepare we also want to create new fd by
1149
     * raw_reconfigure_getfd(). But it wants updated "perm", when in
1150
     * bdrv_reopen_multiple() .bdrv_reopen_prepare() callback called prior to
1151
     * permission update. Happily, permission update is always a part
1152
     * (a separate stage) of bdrv_reopen_multiple() so we can rely on this
1153
     * fact and reconfigure fd in raw_check_perm().
1154
     */
1155

1156
    s->reopen_state = state;
1157
    ret = 0;
1158

1159
out:
1160
    qemu_opts_del(opts);
1161
    return ret;
1162
}
1163

1164
/*
 * Commit stage of a reopen: copy the values collected in the reopen state
 * into the driver state, release the reopen state and clear
 * s->reopen_state.
 */
static void raw_reopen_commit(BDRVReopenState *state)
{
    BDRVRawState *s = state->bs->opaque;
    BDRVRawReopenState *rs = state->opaque;

    s->drop_cache = rs->drop_cache;
    s->check_cache_dropped = rs->check_cache_dropped;
    s->open_flags = rs->open_flags;

    g_free(rs);
    state->opaque = NULL;

    assert(s->reopen_state == state);
    s->reopen_state = NULL;
}
1178

1179

1180
/*
 * Abort stage of a reopen: release the reopen state, if one was allocated,
 * and clear s->reopen_state.
 */
static void raw_reopen_abort(BDRVReopenState *state)
{
    BDRVRawState *s = state->bs->opaque;
    BDRVRawReopenState *rs = state->opaque;

    if (!rs) {
        /* nothing to do, we didn't get far enough */
        return;
    }

    g_free(rs);
    state->opaque = NULL;

    assert(s->reopen_state == state);
    s->reopen_state = NULL;
}
1196

1197
/*
 * Query the BLKSECTGET limit of @fd.
 *
 * For a block device (per @st) the ioctl result is read as an unsigned short
 * and multiplied by 512; for anything else it is read as an int and returned
 * unchanged. Returns -errno if the ioctl fails, or -ENOSYS when BLKSECTGET
 * is not available at build time.
 */
static int hdev_get_max_hw_transfer(int fd, struct stat *st)
{
#ifndef BLKSECTGET
    return -ENOSYS;
#else
    if (S_ISBLK(st->st_mode)) {
        unsigned short nb_sectors = 0;

        if (ioctl(fd, BLKSECTGET, &nb_sectors) != 0) {
            return -errno;
        }
        return nb_sectors * 512;
    } else {
        int nb_bytes = 0;

        if (ioctl(fd, BLKSECTGET, &nb_bytes) != 0) {
            return -errno;
        }
        return nb_bytes;
    }
#endif
}
1216

1217
/*
1218
 * Get a sysfs attribute value as character string.
1219
 */
1220
#ifdef CONFIG_LINUX
1221
static int get_sysfs_str_val(struct stat *st, const char *attribute,
1222
                             char **val) {
1223
    g_autofree char *sysfspath = NULL;
1224
    size_t len;
1225

1226
    if (!S_ISBLK(st->st_mode)) {
1227
        return -ENOTSUP;
1228
    }
1229

1230
    sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
1231
                                major(st->st_rdev), minor(st->st_rdev),
1232
                                attribute);
1233
    if (!g_file_get_contents(sysfspath, val, &len, NULL)) {
1234
        return -ENOENT;
1235
    }
1236

1237
    /* The file is ended with '\n' */
1238
    char *p;
1239
    p = *val;
1240
    if (*(p + len - 1) == '\n') {
1241
        *(p + len - 1) = '\0';
1242
    }
1243
    return 0;
1244
}
1245
#endif
1246

1247
#if defined(CONFIG_BLKZONED)
1248
/*
 * Read the "zoned" sysfs attribute of the device described by @st and
 * translate it into a BlockZoneModel stored in *@zoned.
 *
 * Returns 0 on success, the error from get_sysfs_str_val() if the attribute
 * cannot be read, or -ENOTSUP for an unrecognised value.
 */
static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
{
    static const struct {
        const char *name;
        BlockZoneModel model;
    } known_models[] = {
        { "host-managed", BLK_Z_HM },
        { "host-aware",   BLK_Z_HA },
        { "none",         BLK_Z_NONE },
    };
    g_autofree char *val = NULL;
    size_t i;
    int ret;

    ret = get_sysfs_str_val(st, "zoned", &val);
    if (ret < 0) {
        return ret;
    }

    for (i = 0; i < sizeof(known_models) / sizeof(known_models[0]); i++) {
        if (strcmp(val, known_models[i].name) == 0) {
            *zoned = known_models[i].model;
            return 0;
        }
    }
    return -ENOTSUP;
}
1269
#endif /* defined(CONFIG_BLKZONED) */
1270

1271
/*
1272
 * Get a sysfs attribute value as a long integer.
1273
 */
1274
#ifdef CONFIG_LINUX
1275
static long get_sysfs_long_val(struct stat *st, const char *attribute)
1276
{
1277
    g_autofree char *str = NULL;
1278
    const char *end;
1279
    long val;
1280
    int ret;
1281

1282
    ret = get_sysfs_str_val(st, attribute, &str);
1283
    if (ret < 0) {
1284
        return ret;
1285
    }
1286

1287
    /* The file is ended with '\n', pass 'end' to accept that. */
1288
    ret = qemu_strtol(str, &end, 10, &val);
1289
    if (ret == 0 && end && *end == '\0') {
1290
        ret = val;
1291
    }
1292
    return ret;
1293
}
1294
#endif
1295

1296
/*
 * Return the maximum number of segments of the host device.
 *
 * On Linux, a character device (per @st) is asked through the
 * SG_GET_SG_TABLESIZE ioctl; anything else is looked up through the
 * "max_segments" sysfs attribute. Returns -ENOTSUP when the ioctl fails or
 * when not built for Linux.
 */
static int hdev_get_max_segments(int fd, struct stat *st)
{
#ifndef CONFIG_LINUX
    return -ENOTSUP;
#else
    if (S_ISCHR(st->st_mode)) {
        int table_size;

        if (ioctl(fd, SG_GET_SG_TABLESIZE, &table_size) != 0) {
            return -ENOTSUP;
        }
        return table_size;
    }
    return get_sysfs_long_val(st, "max_segments");
#endif
}
1312

1313
#if defined(CONFIG_BLKZONED)
1314
/*
1315
 * If the reset_all flag is true, then the wps of zone whose state is
1316
 * not readonly or offline should be all reset to the start sector.
1317
 * Else, take the real wp of the device.
1318
 */
1319
static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
1320
                        unsigned int nrz, bool reset_all)
1321
{
1322
    struct blk_zone *blkz;
1323
    size_t rep_size;
1324
    uint64_t sector = offset >> BDRV_SECTOR_BITS;
1325
    BlockZoneWps *wps = bs->wps;
1326
    unsigned int j = offset / bs->bl.zone_size;
1327
    unsigned int n = 0, i = 0;
1328
    int ret;
1329
    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
1330
    g_autofree struct blk_zone_report *rep = NULL;
1331

1332
    rep = g_malloc(rep_size);
1333
    blkz = (struct blk_zone *)(rep + 1);
1334
    while (n < nrz) {
1335
        memset(rep, 0, rep_size);
1336
        rep->sector = sector;
1337
        rep->nr_zones = nrz - n;
1338

1339
        do {
1340
            ret = ioctl(fd, BLKREPORTZONE, rep);
1341
        } while (ret != 0 && errno == EINTR);
1342
        if (ret != 0) {
1343
            error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
1344
                    fd, offset, errno);
1345
            return -errno;
1346
        }
1347

1348
        if (!rep->nr_zones) {
1349
            break;
1350
        }
1351

1352
        for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
1353
            /*
1354
             * The wp tracking cares only about sequential writes required and
1355
             * sequential write preferred zones so that the wp can advance to
1356
             * the right location.
1357
             * Use the most significant bit of the wp location to indicate the
1358
             * zone type: 0 for SWR/SWP zones and 1 for conventional zones.
1359
             */
1360
            if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
1361
                wps->wp[j] |= 1ULL << 63;
1362
            } else {
1363
                switch(blkz[i].cond) {
1364
                case BLK_ZONE_COND_FULL:
1365
                case BLK_ZONE_COND_READONLY:
1366
                    /* Zone not writable */
1367
                    wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
1368
                    break;
1369
                case BLK_ZONE_COND_OFFLINE:
1370
                    /* Zone not writable nor readable */
1371
                    wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
1372
                    break;
1373
                default:
1374
                    if (reset_all) {
1375
                        wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
1376
                    } else {
1377
                        wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
1378
                    }
1379
                    break;
1380
                }
1381
            }
1382
        }
1383
        sector = blkz[i - 1].start + blkz[i - 1].len;
1384
    }
1385

1386
    return 0;
1387
}
1388

1389
/*
 * Re-read the device write pointers of @nrz zones starting at byte @offset
 * (no reset). A failure is only reported, not propagated.
 */
static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
                            unsigned int nrz)
{
    int ret = get_zones_wp(bs, fd, offset, nrz, false);

    if (ret < 0) {
        error_report("update zone wp failed");
    }
}
1396

1397
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
1398
                                     Error **errp)
1399
{
1400
    BDRVRawState *s = bs->opaque;
1401
    BlockZoneModel zoned;
1402
    int ret;
1403

1404
    ret = get_sysfs_zoned_model(st, &zoned);
1405
    if (ret < 0 || zoned == BLK_Z_NONE) {
1406
        goto no_zoned;
1407
    }
1408
    bs->bl.zoned = zoned;
1409

1410
    ret = get_sysfs_long_val(st, "max_open_zones");
1411
    if (ret >= 0) {
1412
        bs->bl.max_open_zones = ret;
1413
    }
1414

1415
    ret = get_sysfs_long_val(st, "max_active_zones");
1416
    if (ret >= 0) {
1417
        bs->bl.max_active_zones = ret;
1418
    }
1419

1420
    /*
1421
     * The zoned device must at least have zone size and nr_zones fields.
1422
     */
1423
    ret = get_sysfs_long_val(st, "chunk_sectors");
1424
    if (ret < 0) {
1425
        error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
1426
                                     "sysfs attribute");
1427
        goto no_zoned;
1428
    } else if (!ret) {
1429
        error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
1430
        goto no_zoned;
1431
    }
1432
    bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
1433

1434
    ret = get_sysfs_long_val(st, "nr_zones");
1435
    if (ret < 0) {
1436
        error_setg_errno(errp, -ret, "Unable to read nr_zones "
1437
                                     "sysfs attribute");
1438
        goto no_zoned;
1439
    } else if (!ret) {
1440
        error_setg(errp, "Read 0 from nr_zones sysfs attribute");
1441
        goto no_zoned;
1442
    }
1443
    bs->bl.nr_zones = ret;
1444

1445
    ret = get_sysfs_long_val(st, "zone_append_max_bytes");
1446
    if (ret > 0) {
1447
        bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
1448
    }
1449

1450
    ret = get_sysfs_long_val(st, "physical_block_size");
1451
    if (ret >= 0) {
1452
        bs->bl.write_granularity = ret;
1453
    }
1454

1455
    /* The refresh_limits() function can be called multiple times. */
1456
    g_free(bs->wps);
1457
    bs->wps = g_malloc(sizeof(BlockZoneWps) +
1458
            sizeof(int64_t) * bs->bl.nr_zones);
1459
    ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
1460
    if (ret < 0) {
1461
        error_setg_errno(errp, -ret, "report wps failed");
1462
        goto no_zoned;
1463
    }
1464
    qemu_co_mutex_init(&bs->wps->colock);
1465
    return;
1466

1467
no_zoned:
1468
    bs->bl.zoned = BLK_Z_NONE;
1469
    g_free(bs->wps);
1470
    bs->wps = NULL;
1471
}
1472
#else /* !defined(CONFIG_BLKZONED) */
1473
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
1474
                                     Error **errp)
1475
{
1476
    bs->bl.zoned = BLK_Z_NONE;
1477
}
1478
#endif /* !defined(CONFIG_BLKZONED) */
1479

1480
/*
 * Refresh the block limits (bs->bl) of a raw POSIX node: memory alignment,
 * and, when available, transfer size, segment count and zoned-device limits.
 */
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    struct stat st;

    s->needs_alignment = raw_needs_alignment(bs);
    raw_probe_alignment(bs, s->fd, errp);

    bs->bl.min_mem_alignment = s->buf_align;
    bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size());

    /*
     * Maximum transfers are best effort, so it is okay to ignore any
     * errors.  That said, based on the man page errors in fstat would be
     * very much unexpected; the only possible case seems to be ENOMEM.
     */
    if (fstat(s->fd, &st) != 0) {
        return;
    }

#if defined(__APPLE__) && (__MACH__)
    struct statfs buf;

    if (!fstatfs(s->fd, &buf)) {
        bs->bl.opt_transfer = buf.f_iosize;
        bs->bl.pdiscard_alignment = buf.f_bsize;
    }
#endif

    if (bdrv_is_sg(bs) || S_ISBLK(st.st_mode)) {
        int max_transfer = hdev_get_max_hw_transfer(s->fd, &st);
        int max_segments;

        /* Only accept values the block layer can represent */
        if (max_transfer > 0 && max_transfer <= BDRV_REQUEST_MAX_BYTES) {
            bs->bl.max_hw_transfer = max_transfer;
        }

        max_segments = hdev_get_max_segments(s->fd, &st);
        if (max_segments > 0) {
            bs->bl.max_hw_iov = max_segments;
        }
    }

    raw_refresh_zoned_limits(bs, &st, errp);
}
1524

1525
/*
 * Issue the BIODASDINFO2 ioctl on @fd and return its result; callers in this
 * file treat a negative value as "not a DASD". Always returns -1 when
 * BIODASDINFO2 is not available at build time.
 */
static int check_for_dasd(int fd)
{
#ifndef BIODASDINFO2
    return -1;
#else
    struct dasd_information2_t info = {0};

    return ioctl(fd, BIODASDINFO2, &info);
#endif
}
1535

1536
/**
1537
 * Try to get @bs's logical and physical block size.
1538
 * On success, store them in @bsz and return zero.
1539
 * On failure, return negative errno.
1540
 */
1541
static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1542
{
1543
    BDRVRawState *s = bs->opaque;
1544
    int ret;
1545

1546
    /* If DASD or zoned devices, get blocksizes */
1547
    if (check_for_dasd(s->fd) < 0) {
1548
        /* zoned devices are not DASD */
1549
        if (bs->bl.zoned == BLK_Z_NONE) {
1550
            return -ENOTSUP;
1551
        }
1552
    }
1553
    ret = probe_logical_blocksize(s->fd, &bsz->log);
1554
    if (ret < 0) {
1555
        return ret;
1556
    }
1557
    return probe_physical_blocksize(s->fd, &bsz->phys);
1558
}
1559

1560
/**
1561
 * Try to get @bs's geometry: cyls, heads, sectors.
1562
 * On success, store them in @geo and return 0.
1563
 * On failure return -errno.
1564
 * (Allows block driver to assign default geometry values that guest sees)
1565
 */
1566
#ifdef __linux__
1567
static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1568
{
1569
    BDRVRawState *s = bs->opaque;
1570
    struct hd_geometry ioctl_geo = {0};
1571

1572
    /* If DASD, get its geometry */
1573
    if (check_for_dasd(s->fd) < 0) {
1574
        return -ENOTSUP;
1575
    }
1576
    if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) {
1577
        return -errno;
1578
    }
1579
    /* HDIO_GETGEO may return success even though geo contains zeros
1580
       (e.g. certain multipath setups) */
1581
    if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) {
1582
        return -ENOTSUP;
1583
    }
1584
    /* Do not return a geometry for partition */
1585
    if (ioctl_geo.start != 0) {
1586
        return -ENOTSUP;
1587
    }
1588
    geo->heads = ioctl_geo.heads;
1589
    geo->sectors = ioctl_geo.sectors;
1590
    geo->cylinders = ioctl_geo.cylinders;
1591

1592
    return 0;
1593
}
1594
#else /* __linux__ */
1595
static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
1596
{
1597
    return -ENOTSUP;
1598
}
1599
#endif
1600

1601
#if defined(__linux__)
1602
static int handle_aiocb_ioctl(void *opaque)
1603
{
1604
    RawPosixAIOData *aiocb = opaque;
1605
    int ret;
1606

1607
    ret = RETRY_ON_EINTR(
1608
        ioctl(aiocb->aio_fildes, aiocb->ioctl.cmd, aiocb->ioctl.buf)
1609
    );
1610
    if (ret == -1) {
1611
        return -errno;
1612
    }
1613

1614
    return 0;
1615
}
1616
#endif /* linux */
1617

1618
static int handle_aiocb_flush(void *opaque)
1619
{
1620
    RawPosixAIOData *aiocb = opaque;
1621
    BDRVRawState *s = aiocb->bs->opaque;
1622
    int ret;
1623

1624
    if (s->page_cache_inconsistent) {
1625
        return -s->page_cache_inconsistent;
1626
    }
1627

1628
    ret = qemu_fdatasync(aiocb->aio_fildes);
1629
    if (ret == -1) {
1630
        trace_file_flush_fdatasync_failed(errno);
1631

1632
        /* There is no clear definition of the semantics of a failing fsync(),
1633
         * so we may have to assume the worst. The sad truth is that this
1634
         * assumption is correct for Linux. Some pages are now probably marked
1635
         * clean in the page cache even though they are inconsistent with the
1636
         * on-disk contents. The next fdatasync() call would succeed, but no
1637
         * further writeback attempt will be made. We can't get back to a state
1638
         * in which we know what is on disk (we would have to rewrite
1639
         * everything that was touched since the last fdatasync() at least), so
1640
         * make bdrv_flush() fail permanently. Given that the behaviour isn't
1641
         * really defined, I have little hope that other OSes are doing better.
1642
         *
1643
         * Obviously, this doesn't affect O_DIRECT, which bypasses the page
1644
         * cache. */
1645
        if ((s->open_flags & O_DIRECT) == 0) {
1646
            s->page_cache_inconsistent = errno;
1647
        }
1648
        return -errno;
1649
    }
1650
    return 0;
1651
}
1652

1653
/*
 * Vectored positional I/O wrappers.
 *
 * With CONFIG_PREADV they forward to preadv()/pwritev() and preadv_present
 * starts out true. Without it they return -ENOSYS and preadv_present is
 * false.
 */
#ifndef CONFIG_PREADV

static bool preadv_present = false;

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return -ENOSYS;
}

#else /* CONFIG_PREADV */

static bool preadv_present = true;

static ssize_t
qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return preadv(fd, iov, nr_iov, offset);
}

static ssize_t
qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
{
    return pwritev(fd, iov, nr_iov, offset);
}

#endif /* CONFIG_PREADV */
1686

1687
/*
 * Perform the request in @aiocb with a single vectored call, retried on
 * EINTR: qemu_pwritev() for writes and zone appends, qemu_preadv()
 * otherwise.
 *
 * Returns the number of bytes transferred, or -errno when the call returned
 * -1.
 */
static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
{
    ssize_t done;

    if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
        done = RETRY_ON_EINTR(
            qemu_pwritev(aiocb->aio_fildes,
                         aiocb->io.iov,
                         aiocb->io.niov,
                         aiocb->aio_offset)
        );
    } else {
        done = RETRY_ON_EINTR(
            qemu_preadv(aiocb->aio_fildes,
                        aiocb->io.iov,
                        aiocb->io.niov,
                        aiocb->aio_offset)
        );
    }

    return done == -1 ? -errno : done;
}
1708

1709
/*
1710
 * Read/writes the data to/from a given linear buffer.
1711
 *
1712
 * Returns the number of bytes handles or -errno in case of an error. Short
1713
 * reads are only returned if the end of the file is reached.
1714
 */
1715
static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
1716
{
1717
    ssize_t offset = 0;
1718
    ssize_t len;
1719

1720
    while (offset < aiocb->aio_nbytes) {
1721
        if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
1722
            len = pwrite(aiocb->aio_fildes,
1723
                         (const char *)buf + offset,
1724
                         aiocb->aio_nbytes - offset,
1725
                         aiocb->aio_offset + offset);
1726
        } else {
1727
            len = pread(aiocb->aio_fildes,
1728
                        buf + offset,
1729
                        aiocb->aio_nbytes - offset,
1730
                        aiocb->aio_offset + offset);
1731
        }
1732
        if (len == -1 && errno == EINTR) {
1733
            continue;
1734
        } else if (len == -1 && errno == EINVAL &&
1735
                   (aiocb->bs->open_flags & BDRV_O_NOCACHE) &&
1736
                   !(aiocb->aio_type & QEMU_AIO_WRITE) &&
1737
                   offset > 0) {
1738
            /* O_DIRECT pread() may fail with EINVAL when offset is unaligned
1739
             * after a short read.  Assume that O_DIRECT short reads only occur
1740
             * at EOF.  Therefore this is a short read, not an I/O error.
1741
             */
1742
            break;
1743
        } else if (len == -1) {
1744
            offset = -errno;
1745
            break;
1746
        } else if (len == 0) {
1747
            break;
1748
        }
1749
        offset += len;
1750
    }
1751

1752
    return offset;
1753
}
1754

1755
static int handle_aiocb_rw(void *opaque)
1756
{
1757
    RawPosixAIOData *aiocb = opaque;
1758
    ssize_t nbytes;
1759
    char *buf;
1760

1761
    if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) {
1762
        /*
1763
         * If there is just a single buffer, and it is properly aligned
1764
         * we can just use plain pread/pwrite without any problems.
1765
         */
1766
        if (aiocb->io.niov == 1) {
1767
            nbytes = handle_aiocb_rw_linear(aiocb, aiocb->io.iov->iov_base);
1768
            goto out;
1769
        }
1770
        /*
1771
         * We have more than one iovec, and all are properly aligned.
1772
         *
1773
         * Try preadv/pwritev first and fall back to linearizing the
1774
         * buffer if it's not supported.
1775
         */
1776
        if (preadv_present) {
1777
            nbytes = handle_aiocb_rw_vector(aiocb);
1778
            if (nbytes == aiocb->aio_nbytes ||
1779
                (nbytes < 0 && nbytes != -ENOSYS)) {
1780
                goto out;
1781
            }
1782
            preadv_present = false;
1783
        }
1784

1785
        /*
1786
         * XXX(hch): short read/write.  no easy way to handle the reminder
1787
         * using these interfaces.  For now retry using plain
1788
         * pread/pwrite?
1789
         */
1790
    }
1791

1792
    /*
1793
     * Ok, we have to do it the hard way, copy all segments into
1794
     * a single aligned buffer.
1795
     */
1796
    buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes);
1797
    if (buf == NULL) {
1798
        nbytes = -ENOMEM;
1799
        goto out;
1800
    }
1801

1802
    if (aiocb->aio_type & QEMU_AIO_WRITE) {
1803
        char *p = buf;
1804
        int i;
1805

1806
        for (i = 0; i < aiocb->io.niov; ++i) {
1807
            memcpy(p, aiocb->io.iov[i].iov_base, aiocb->io.iov[i].iov_len);
1808
            p += aiocb->io.iov[i].iov_len;
1809
        }
1810
        assert(p - buf == aiocb->aio_nbytes);
1811
    }
1812

1813
    nbytes = handle_aiocb_rw_linear(aiocb, buf);
1814
    if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
1815
        char *p = buf;
1816
        size_t count = aiocb->aio_nbytes, copy;
1817
        int i;
1818

1819
        for (i = 0; i < aiocb->io.niov && count; ++i) {
1820
            copy = count;
1821
            if (copy > aiocb->io.iov[i].iov_len) {
1822
                copy = aiocb->io.iov[i].iov_len;
1823
            }
1824
            memcpy(aiocb->io.iov[i].iov_base, p, copy);
1825
            assert(count >= copy);
1826
            p     += copy;
1827
            count -= copy;
1828
        }
1829
        assert(count == 0);
1830
    }
1831
    qemu_vfree(buf);
1832

1833
out:
1834
    if (nbytes == aiocb->aio_nbytes) {
1835
        return 0;
1836
    } else if (nbytes >= 0 && nbytes < aiocb->aio_nbytes) {
1837
        if (aiocb->aio_type & QEMU_AIO_WRITE) {
1838
            return -EINVAL;
1839
        } else {
1840
            iov_memset(aiocb->io.iov, aiocb->io.niov, nbytes,
1841
                      0, aiocb->aio_nbytes - nbytes);
1842
            return 0;
1843
        }
1844
    } else {
1845
        assert(nbytes < 0);
1846
        return nbytes;
1847
    }
1848
}
1849

1850
#if defined(CONFIG_FALLOCATE) || defined(BLKZEROOUT) || defined(BLKDISCARD)
1851
/*
 * Fold the negative errno values -ENODEV, -ENOSYS, -EOPNOTSUPP and -ENOTTY
 * into -ENOTSUP; any other value is returned unchanged.
 */
static int translate_err(int err)
{
    switch (err) {
    case -ENODEV:
    case -ENOSYS:
    case -EOPNOTSUPP:
    case -ENOTTY:
        return -ENOTSUP;
    default:
        return err;
    }
}
1859
#endif
1860

1861
#ifdef CONFIG_FALLOCATE
1862
/*
 * Call fallocate() with the given arguments, retrying while it is
 * interrupted (EINTR). Returns 0 on success, otherwise the negative errno
 * after passing it through translate_err().
 */
static int do_fallocate(int fd, int mode, off_t offset, off_t len)
{
    while (fallocate(fd, mode, offset, len) != 0) {
        if (errno != EINTR) {
            return translate_err(-errno);
        }
    }
    return 0;
}
1871
#endif
1872

1873
/*
 * Zero the byte range of @aiocb with the BLKZEROOUT ioctl.
 *
 * Returns 0 on success. Returns -ENOTSUP when s->has_write_zeroes is false,
 * when BLKZEROOUT is not available at build time, or when the request
 * carries QEMU_AIO_NO_FALLBACK. Otherwise the translated errno of the failed
 * ioctl is returned; if that is -ENOTSUP, s->has_write_zeroes is cleared so
 * later requests fail immediately.
 */
static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
{
    BDRVRawState *s = aiocb->bs->opaque;
    int ret = -ENOTSUP;

    if (!s->has_write_zeroes) {
        return -ENOTSUP;
    }

#ifdef BLKZEROOUT
    /* The BLKZEROOUT implementation in the kernel doesn't set
     * BLKDEV_ZERO_NOFALLBACK, so we can't call this if we have to avoid slow
     * fallbacks. */
    if (!(aiocb->aio_type & QEMU_AIO_NO_FALLBACK)) {
        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };

        while (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) != 0) {
            if (errno == EINTR) {
                continue;
            }
            ret = translate_err(-errno);
            if (ret == -ENOTSUP) {
                s->has_write_zeroes = false;
            }
            return ret;
        }
        return 0;
    }
#endif

    return ret;
}
1903

1904
static int handle_aiocb_write_zeroes(void *opaque)
1905
{
1906
    RawPosixAIOData *aiocb = opaque;
1907
#ifdef CONFIG_FALLOCATE
1908
    BDRVRawState *s = aiocb->bs->opaque;
1909
    int64_t len;
1910
#endif
1911

1912
    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
1913
        return handle_aiocb_write_zeroes_block(aiocb);
1914
    }
1915

1916
#ifdef CONFIG_FALLOCATE_ZERO_RANGE
1917
    if (s->has_write_zeroes) {
1918
        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
1919
                               aiocb->aio_offset, aiocb->aio_nbytes);
1920
        if (ret == -ENOTSUP) {
1921
            s->has_write_zeroes = false;
1922
        } else if (ret == 0 || ret != -EINVAL) {
1923
            return ret;
1924
        }
1925
        /*
1926
         * Note: Some file systems do not like unaligned byte ranges, and
1927
         * return EINVAL in such a case, though they should not do it according
1928
         * to the man-page of fallocate(). Thus we simply ignore this return
1929
         * value and try the other fallbacks instead.
1930
         */
1931
    }
1932
#endif
1933

1934
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1935
    if (s->has_discard && s->has_fallocate) {
1936
        int ret = do_fallocate(s->fd,
1937
                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1938
                               aiocb->aio_offset, aiocb->aio_nbytes);
1939
        if (ret == 0) {
1940
            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1941
            if (ret == 0 || ret != -ENOTSUP) {
1942
                return ret;
1943
            }
1944
            s->has_fallocate = false;
1945
        } else if (ret == -EINVAL) {
1946
            /*
1947
             * Some file systems like older versions of GPFS do not like un-
1948
             * aligned byte ranges, and return EINVAL in such a case, though
1949
             * they should not do it according to the man-page of fallocate().
1950
             * Warn about the bad filesystem and try the final fallback instead.
1951
             */
1952
            warn_report_once("Your file system is misbehaving: "
1953
                             "fallocate(FALLOC_FL_PUNCH_HOLE) returned EINVAL. "
1954
                             "Please report this bug to your file system "
1955
                             "vendor.");
1956
        } else if (ret != -ENOTSUP) {
1957
            return ret;
1958
        } else {
1959
            s->has_discard = false;
1960
        }
1961
    }
1962
#endif
1963

1964
#ifdef CONFIG_FALLOCATE
1965
    /* Last resort: we are trying to extend the file with zeroed data. This
1966
     * can be done via fallocate(fd, 0) */
1967
    len = raw_getlength(aiocb->bs);
1968
    if (s->has_fallocate && len >= 0 && aiocb->aio_offset >= len) {
1969
        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
1970
        if (ret == 0 || ret != -ENOTSUP) {
1971
            return ret;
1972
        }
1973
        s->has_fallocate = false;
1974
    }
1975
#endif
1976

1977
    return -ENOTSUP;
1978
}
1979

1980
static int handle_aiocb_write_zeroes_unmap(void *opaque)
1981
{
1982
    RawPosixAIOData *aiocb = opaque;
1983
    BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
1984

1985
    /* First try to write zeros and unmap at the same time */
1986

1987
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
1988
    int ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1989
                           aiocb->aio_offset, aiocb->aio_nbytes);
1990
    switch (ret) {
1991
    case -ENOTSUP:
1992
    case -EINVAL:
1993
    case -EBUSY:
1994
        break;
1995
    default:
1996
        return ret;
1997
    }
1998
#endif
1999

2000
    /* If we couldn't manage to unmap while guaranteed that the area reads as
2001
     * all-zero afterwards, just write zeroes without unmapping */
2002
    return handle_aiocb_write_zeroes(aiocb);
2003
}
2004

2005
#ifndef HAVE_COPY_FILE_RANGE
/*
 * Replacement for copy_file_range() on builds where HAVE_COPY_FILE_RANGE is
 * not defined.
 *
 * If the syscall number __NR_copy_file_range is known, the raw syscall is
 * issued with the arguments passed through unchanged. Otherwise the call
 * fails with -1 and errno set to ENOSYS.
 */
static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
                             off_t *out_off, size_t len, unsigned int flags)
{
#ifndef __NR_copy_file_range
    errno = ENOSYS;
    return -1;
#else
    return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
                   out_off, len, flags);
#endif
}
#endif
2018

2019
/*
 * parse_zone - convert one kernel zone record into a BlockZoneDescriptor
 *
 * @zone: descriptor to fill in
 * @blkz: zone record to read from
 *
 * start, length, wp and cap are the kernel values shifted left by
 * BDRV_SECTOR_BITS. When HAVE_BLK_ZONE_REP_CAPACITY is not defined, cap is
 * derived from the zone length instead of the capacity field.
 *
 * Returns 0 on success. Returns -ENOTSUP, after reporting the offending
 * value, when the zone type or the zone condition is not recognised; in that
 * case @zone has already been partially written.
 */
#if defined(CONFIG_BLKZONED)
static inline int parse_zone(struct BlockZoneDescriptor *zone,
                              const struct blk_zone *blkz)
{
    zone->start = blkz->start << BDRV_SECTOR_BITS;
    zone->length = blkz->len << BDRV_SECTOR_BITS;
    zone->wp = blkz->wp << BDRV_SECTOR_BITS;

#ifndef HAVE_BLK_ZONE_REP_CAPACITY
    zone->cap = blkz->len << BDRV_SECTOR_BITS;
#else
    zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
#endif

    /* Map the zone type */
    switch (blkz->type) {
    case BLK_ZONE_TYPE_CONVENTIONAL:
        zone->type = BLK_ZT_CONV;
        break;
    case BLK_ZONE_TYPE_SEQWRITE_REQ:
        zone->type = BLK_ZT_SWR;
        break;
    case BLK_ZONE_TYPE_SEQWRITE_PREF:
        zone->type = BLK_ZT_SWP;
        break;
    default:
        error_report("Unsupported zone type: 0x%x", blkz->type);
        return -ENOTSUP;
    }

    /* Map the zone condition */
    switch (blkz->cond) {
    case BLK_ZONE_COND_NOT_WP:
        zone->state = BLK_ZS_NOT_WP;
        break;
    case BLK_ZONE_COND_EMPTY:
        zone->state = BLK_ZS_EMPTY;
        break;
    case BLK_ZONE_COND_IMP_OPEN:
        zone->state = BLK_ZS_IOPEN;
        break;
    case BLK_ZONE_COND_EXP_OPEN:
        zone->state = BLK_ZS_EOPEN;
        break;
    case BLK_ZONE_COND_CLOSED:
        zone->state = BLK_ZS_CLOSED;
        break;
    case BLK_ZONE_COND_READONLY:
        zone->state = BLK_ZS_RDONLY;
        break;
    case BLK_ZONE_COND_FULL:
        zone->state = BLK_ZS_FULL;
        break;
    case BLK_ZONE_COND_OFFLINE:
        zone->state = BLK_ZS_OFFLINE;
        break;
    default:
        error_report("Unsupported zone state: 0x%x", blkz->cond);
        return -ENOTSUP;
    }

    return 0;
}
#endif
2082

2083
#if defined(CONFIG_BLKZONED)
/*
 * Fetch zone information with the BLKREPORTZONE ioctl.
 *
 * @opaque is a RawPosixAIOData. On entry *zone_report.nr_zones is the number
 * of descriptors that zone_report.zones can hold; reporting starts at
 * aio_offset. The ioctl is repeated, each time starting after the last zone
 * already parsed, until that many zones have been collected or the kernel
 * returns no further zones.
 *
 * Returns 0 on success and stores the number of zones actually parsed in
 * *zone_report.nr_zones. Returns the negative errno of a failed ioctl, or the
 * error from parse_zone(); *zone_report.nr_zones is left untouched then.
 */
static int handle_aiocb_zone_report(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int fd = aiocb->aio_fildes;
    unsigned int *nr_zones = aiocb->zone_report.nr_zones;
    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
    /* zoned block devices use 512-byte sectors */
    uint64_t sector = aiocb->aio_offset / 512;

    struct blk_zone *blkz;
    size_t rep_size;
    unsigned int nrz;
    int ret;
    unsigned int n = 0, i = 0;

    nrz = *nr_zones;
    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
    g_autofree struct blk_zone_report *rep = NULL;
    rep = g_malloc(rep_size);

    /* The zone records follow the report header directly */
    blkz = (struct blk_zone *)(rep + 1);
    while (n < nrz) {
        memset(rep, 0, rep_size);
        rep->sector = sector;
        rep->nr_zones = nrz - n;

        do {
            ret = ioctl(fd, BLKREPORTZONE, rep);
        } while (ret != 0 && errno == EINTR);
        if (ret != 0) {
            /*
             * Capture errno first: error_report() is free to change it, and
             * the caller must get the ioctl's error code.
             */
            ret = -errno;
            error_report("%d: ioctl BLKREPORTZONE at %" PRIu64 " failed %d",
                         fd, sector, -ret);
            return ret;
        }

        if (!rep->nr_zones) {
            /* Nothing more to report */
            break;
        }

        for (i = 0; i < rep->nr_zones; i++, n++) {
            ret = parse_zone(&zones[n], &blkz[i]);
            if (ret != 0) {
                return ret;
            }

            /* The next report should start after the last zone reported */
            sector = blkz[i].start + blkz[i].len;
        }
    }

    *nr_zones = n;
    return 0;
}
#endif
2138

2139
#if defined(CONFIG_BLKZONED)
/*
 * Issue the zone management ioctl stored in zone_mgmt.op of the
 * RawPosixAIOData passed in @opaque.
 *
 * The affected range is aio_offset / aio_nbytes, converted to 512-byte
 * sectors. The ioctl is restarted while it fails with EINTR.
 *
 * Returns the ioctl's return value, or the negative errno when that value is
 * negative.
 */
static int handle_aiocb_zone_mgmt(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    int fd = aiocb->aio_fildes;
    uint64_t first_sector = aiocb->aio_offset / 512;
    int64_t sector_count = aiocb->aio_nbytes / 512;
    struct blk_zone_range range;
    int rc;

    range.sector = first_sector;
    range.nr_sectors = sector_count;

    /* Execute the operation */
    for (;;) {
        rc = ioctl(fd, aiocb->zone_mgmt.op, &range);
        if (rc == 0 || errno != EINTR) {
            break;
        }
    }

    if (rc < 0) {
        return -errno;
    }
    return rc;
}
#endif
2159

2160
static int handle_aiocb_copy_range(void *opaque)
2161
{
2162
    RawPosixAIOData *aiocb = opaque;
2163
    uint64_t bytes = aiocb->aio_nbytes;
2164
    off_t in_off = aiocb->aio_offset;
2165
    off_t out_off = aiocb->copy_range.aio_offset2;
2166

2167
    while (bytes) {
2168
        ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
2169
                                      aiocb->copy_range.aio_fd2, &out_off,
2170
                                      bytes, 0);
2171
        trace_file_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
2172
                                   aiocb->copy_range.aio_fd2, out_off, bytes,
2173
                                   0, ret);
2174
        if (ret == 0) {
2175
            /* No progress (e.g. when beyond EOF), let the caller fall back to
2176
             * buffer I/O. */
2177
            return -ENOSPC;
2178
        }
2179
        if (ret < 0) {
2180
            switch (errno) {
2181
            case ENOSYS:
2182
                return -ENOTSUP;
2183
            case EINTR:
2184
                continue;
2185
            default:
2186
                return -errno;
2187
            }
2188
        }
2189
        bytes -= ret;
2190
    }
2191
    return 0;
2192
}
2193

2194
static int handle_aiocb_discard(void *opaque)
2195
{
2196
    RawPosixAIOData *aiocb = opaque;
2197
    int ret = -ENOTSUP;
2198
    BDRVRawState *s = aiocb->bs->opaque;
2199

2200
    if (!s->has_discard) {
2201
        return -ENOTSUP;
2202
    }
2203

2204
    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
2205
#ifdef BLKDISCARD
2206
        do {
2207
            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
2208
            if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) {
2209
                return 0;
2210
            }
2211
        } while (errno == EINTR);
2212

2213
        ret = translate_err(-errno);
2214
#endif
2215
    } else {
2216
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
2217
        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
2218
                           aiocb->aio_offset, aiocb->aio_nbytes);
2219
        ret = translate_err(ret);
2220
#elif defined(__APPLE__) && (__MACH__)
2221
        fpunchhole_t fpunchhole;
2222
        fpunchhole.fp_flags = 0;
2223
        fpunchhole.reserved = 0;
2224
        fpunchhole.fp_offset = aiocb->aio_offset;
2225
        fpunchhole.fp_length = aiocb->aio_nbytes;
2226
        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
2227
            ret = errno == ENODEV ? -ENOTSUP : -errno;
2228
        } else {
2229
            ret = 0;
2230
        }
2231
#endif
2232
    }
2233

2234
    if (ret == -ENOTSUP) {
2235
        s->has_discard = false;
2236
    }
2237
    return ret;
2238
}
2239

2240
/*
2241
 * Help alignment probing by allocating the first block.
2242
 *
2243
 * When reading with direct I/O from unallocated area on Gluster backed by XFS,
2244
 * reading succeeds regardless of request length. In this case we fallback to
2245
 * safe alignment which is not optimal. Allocating the first block avoids this
2246
 * fallback.
2247
 *
2248
 * fd may be opened with O_DIRECT, but we don't know the buffer alignment or
2249
 * request alignment, so we use safe values.
2250
 *
2251
 * Returns: 0 on success, -errno on failure. Since this is an optimization,
2252
 * caller may ignore failures.
2253
 */
2254
static int allocate_first_block(int fd, size_t max_size)
2255
{
2256
    size_t write_size = (max_size < MAX_BLOCKSIZE)
2257
        ? BDRV_SECTOR_SIZE
2258
        : MAX_BLOCKSIZE;
2259
    size_t max_align = MAX(MAX_BLOCKSIZE, qemu_real_host_page_size());
2260
    void *buf;
2261
    ssize_t n;
2262
    int ret;
2263

2264
    buf = qemu_memalign(max_align, write_size);
2265
    memset(buf, 0, write_size);
2266

2267
    n = RETRY_ON_EINTR(pwrite(fd, buf, write_size, 0));
2268

2269
    ret = (n == -1) ? -errno : 0;
2270

2271
    qemu_vfree(buf);
2272
    return ret;
2273
}
2274

2275
static int handle_aiocb_truncate(void *opaque)
2276
{
2277
    RawPosixAIOData *aiocb = opaque;
2278
    int result = 0;
2279
    int64_t current_length = 0;
2280
    char *buf = NULL;
2281
    struct stat st;
2282
    int fd = aiocb->aio_fildes;
2283
    int64_t offset = aiocb->aio_offset;
2284
    PreallocMode prealloc = aiocb->truncate.prealloc;
2285
    Error **errp = aiocb->truncate.errp;
2286

2287
    if (fstat(fd, &st) < 0) {
2288
        result = -errno;
2289
        error_setg_errno(errp, -result, "Could not stat file");
2290
        return result;
2291
    }
2292

2293
    current_length = st.st_size;
2294
    if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
2295
        error_setg(errp, "Cannot use preallocation for shrinking files");
2296
        return -ENOTSUP;
2297
    }
2298

2299
    switch (prealloc) {
2300
#ifdef CONFIG_POSIX_FALLOCATE
2301
    case PREALLOC_MODE_FALLOC:
2302
        /*
2303
         * Truncating before posix_fallocate() makes it about twice slower on
2304
         * file systems that do not support fallocate(), trying to check if a
2305
         * block is allocated before allocating it, so don't do that here.
2306
         */
2307
        if (offset != current_length) {
2308
            result = -posix_fallocate(fd, current_length,
2309
                                      offset - current_length);
2310
            if (result != 0) {
2311
                /* posix_fallocate() doesn't set errno. */
2312
                error_setg_errno(errp, -result,
2313
                                 "Could not preallocate new data");
2314
            } else if (current_length == 0) {
2315
                /*
2316
                 * posix_fallocate() uses fallocate() if the filesystem
2317
                 * supports it, or fallback to manually writing zeroes. If
2318
                 * fallocate() was used, unaligned reads from the fallocated
2319
                 * area in raw_probe_alignment() will succeed, hence we need to
2320
                 * allocate the first block.
2321
                 *
2322
                 * Optimize future alignment probing; ignore failures.
2323
                 */
2324
                allocate_first_block(fd, offset);
2325
            }
2326
        } else {
2327
            result = 0;
2328
        }
2329
        goto out;
2330
#endif
2331
    case PREALLOC_MODE_FULL:
2332
    {
2333
        int64_t num = 0, left = offset - current_length;
2334
        off_t seek_result;
2335

2336
        /*
2337
         * Knowing the final size from the beginning could allow the file
2338
         * system driver to do less allocations and possibly avoid
2339
         * fragmentation of the file.
2340
         */
2341
        if (ftruncate(fd, offset) != 0) {
2342
            result = -errno;
2343
            error_setg_errno(errp, -result, "Could not resize file");
2344
            goto out;
2345
        }
2346

2347
        buf = g_malloc0(65536);
2348

2349
        seek_result = lseek(fd, current_length, SEEK_SET);
2350
        if (seek_result < 0) {
2351
            result = -errno;
2352
            error_setg_errno(errp, -result,
2353
                             "Failed to seek to the old end of file");
2354
            goto out;
2355
        }
2356

2357
        while (left > 0) {
2358
            num = MIN(left, 65536);
2359
            result = write(fd, buf, num);
2360
            if (result < 0) {
2361
                if (errno == EINTR) {
2362
                    continue;
2363
                }
2364
                result = -errno;
2365
                error_setg_errno(errp, -result,
2366
                                 "Could not write zeros for preallocation");
2367
                goto out;
2368
            }
2369
            left -= result;
2370
        }
2371
        if (result >= 0) {
2372
            result = fsync(fd);
2373
            if (result < 0) {
2374
                result = -errno;
2375
                error_setg_errno(errp, -result,
2376
                                 "Could not flush file to disk");
2377
                goto out;
2378
            }
2379
        }
2380
        goto out;
2381
    }
2382
    case PREALLOC_MODE_OFF:
2383
        if (ftruncate(fd, offset) != 0) {
2384
            result = -errno;
2385
            error_setg_errno(errp, -result, "Could not resize file");
2386
        } else if (current_length == 0 && offset > current_length) {
2387
            /* Optimize future alignment probing; ignore failures. */
2388
            allocate_first_block(fd, offset);
2389
        }
2390
        return result;
2391
    default:
2392
        result = -ENOTSUP;
2393
        error_setg(errp, "Unsupported preallocation mode: %s",
2394
                   PreallocMode_str(prealloc));
2395
        return result;
2396
    }
2397

2398
out:
2399
    if (result < 0) {
2400
        if (ftruncate(fd, current_length) < 0) {
2401
            error_report("Failed to restore old file length: %s",
2402
                         strerror(errno));
2403
        }
2404
    }
2405

2406
    g_free(buf);
2407
    return result;
2408
}
2409

2410
/*
 * Hand @func with argument @arg to thread_pool_submit_co() and pass its
 * result back to the caller.
 */
static int coroutine_fn raw_thread_pool_submit(ThreadPoolFunc func, void *arg)
{
    int rc = thread_pool_submit_co(func, arg);

    return rc;
}
2414

2415
/*
2416
 * Check if all memory in this vector is sector aligned.
2417
 */
2418
static bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2419
{
2420
    int i;
2421
    size_t alignment = bdrv_min_mem_align(bs);
2422
    size_t len = bs->bl.request_alignment;
2423
    IO_CODE();
2424

2425
    for (i = 0; i < qiov->niov; i++) {
2426
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2427
            return false;
2428
        }
2429
        if (qiov->iov[i].iov_len % len) {
2430
            return false;
2431
        }
2432
    }
2433

2434
    return true;
2435
}
2436

2437
#ifdef CONFIG_LINUX_IO_URING
/*
 * Tell whether io_uring can be used for @s in the current AioContext.
 *
 * Returns false straight away when use_linux_io_uring is not set. Otherwise
 * io_uring is set up for the current AioContext; if that fails, the error is
 * reported, use_linux_io_uring is cleared and false is returned.
 */
static inline bool raw_check_linux_io_uring(BDRVRawState *s)
{
    bool usable = s->use_linux_io_uring;

    if (usable) {
        Error *local_err = NULL;
        AioContext *ctx = qemu_get_current_aio_context();

        if (unlikely(!aio_setup_linux_io_uring(ctx, &local_err))) {
            error_reportf_err(local_err, "Unable to use linux io_uring, "
                                         "falling back to thread pool: ");
            s->use_linux_io_uring = false;
            usable = false;
        }
    }
    return usable;
}
#endif
2457

2458
#ifdef CONFIG_LINUX_AIO
/*
 * Tell whether Linux AIO can be used for @s in the current AioContext.
 *
 * Returns false straight away when use_linux_aio is not set. Otherwise Linux
 * AIO is set up for the current AioContext; if that fails, the error is
 * reported, use_linux_aio is cleared and false is returned.
 */
static inline bool raw_check_linux_aio(BDRVRawState *s)
{
    bool usable = s->use_linux_aio;

    if (usable) {
        Error *local_err = NULL;
        AioContext *ctx = qemu_get_current_aio_context();

        if (unlikely(!aio_setup_linux_aio(ctx, &local_err))) {
            error_reportf_err(local_err, "Unable to use Linux AIO, "
                                         "falling back to thread pool: ");
            s->use_linux_aio = false;
            usable = false;
        }
    }
    return usable;
}
#endif
2478

2479
/*
 * Common read/write path behind raw_co_preadv() and raw_co_pwritev().
 *
 * @offset_ptr: in: byte offset of the request; out: for a zone append on a
 *              non-conventional zone, the value of the zone's write pointer
 *              entry when the request completed
 * @bytes:      request length, must equal qiov->size
 * @type:       QEMU_AIO_* flags describing the request
 *
 * The request is submitted through io_uring or Linux AIO when one of them is
 * usable and no realignment is needed, and through the thread pool otherwise.
 * With CONFIG_BLKZONED, writes and zone appends to a zoned device hold
 * bs->wps->colock for the duration of the request, and the cached write
 * pointers are updated afterwards.
 *
 * Returns the result of the submission, or -EIO if fd_open() fails.
 */
static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
                                   uint64_t bytes, QEMUIOVector *qiov, int type)
{
    BDRVRawState *s = bs->opaque;
    uint64_t offset = *offset_ptr;
    RawPosixAIOData req;
    int ret;

    if (fd_open(bs) < 0) {
        return -EIO;
    }
#if defined(CONFIG_BLKZONED)
    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
        bs->bl.zoned != BLK_Z_NONE) {
        qemu_co_mutex_lock(&bs->wps->colock);
        if (type & QEMU_AIO_ZONE_APPEND) {
            /* An append goes to the cached write pointer of its zone */
            int zone_idx = offset / bs->bl.zone_size;
            offset = bs->wps->wp[zone_idx];
        }
    }
#endif

    /*
     * With O_DIRECT the request has to be aligned before libaio or io_uring
     * can take it. If it is not, fall back to the regular thread pool
     * read/write code, which emulates this for us when QEMU_AIO_MISALIGNED
     * is set.
     */
    if (s->needs_alignment && !bdrv_qiov_is_aligned(bs, qiov)) {
        type |= QEMU_AIO_MISALIGNED;
#ifdef CONFIG_LINUX_IO_URING
    } else if (raw_check_linux_io_uring(s)) {
        assert(qiov->size == bytes);
        ret = luring_co_submit(bs, s->fd, offset, qiov, type);
        goto out;
#endif
#ifdef CONFIG_LINUX_AIO
    } else if (raw_check_linux_aio(s)) {
        assert(qiov->size == bytes);
        ret = laio_co_submit(s->fd, offset, qiov, type,
                              s->aio_max_batch);
        goto out;
#endif
    }

    req = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = type,
        .aio_offset     = offset,
        .aio_nbytes     = bytes,
        .io             = {
            .iov            = qiov->iov,
            .niov           = qiov->niov,
        },
    };

    assert(qiov->size == bytes);
    ret = raw_thread_pool_submit(handle_aiocb_rw, &req);
    goto out; /* Avoid the compiler err of unused label */

out:
#if defined(CONFIG_BLKZONED)
    if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) &&
        bs->bl.zoned != BLK_Z_NONE) {
        BlockZoneWps *wps = bs->wps;

        if (ret != 0) {
            /*
             * write and append write are not allowed to cross zone
             * boundaries
             */
            update_zones_wp(bs, s->fd, offset, 1);
        } else {
            uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];

            if (!BDRV_ZT_IS_CONV(*wp)) {
                if (type & QEMU_AIO_ZONE_APPEND) {
                    *offset_ptr = *wp;
                    trace_zbd_zone_append_complete(bs, *offset_ptr
                        >> BDRV_SECTOR_BITS);
                }
                /* Advance the wp if needed */
                if (offset + bytes > *wp) {
                    *wp = offset + bytes;
                }
            }
        }

        qemu_co_mutex_unlock(&wps->colock);
    }
#endif
    return ret;
}
2569

2570
/*
 * Read @bytes bytes at @offset into @qiov by forwarding to raw_co_prw() with
 * QEMU_AIO_READ. @flags is not used.
 */
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
                                      int64_t bytes, QEMUIOVector *qiov,
                                      BdrvRequestFlags flags)
{
    int rc = raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_READ);

    return rc;
}
2576

2577
/*
 * Write @bytes bytes from @qiov at @offset by forwarding to raw_co_prw() with
 * QEMU_AIO_WRITE. @flags is not used.
 */
static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
                                       int64_t bytes, QEMUIOVector *qiov,
                                       BdrvRequestFlags flags)
{
    int rc = raw_co_prw(bs, &offset, bytes, qiov, QEMU_AIO_WRITE);

    return rc;
}
2583

2584
/*
 * Submit a QEMU_AIO_FLUSH request for @bs.
 *
 * io_uring is used when it is usable. Otherwise Linux AIO is used if
 * has_laio_fdsync is set and Linux AIO is usable. Failing both, the flush is
 * run through the thread pool.
 *
 * Returns the error from fd_open() if that fails, otherwise the result of
 * the submission.
 */
static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData flush_req;
    int rc = fd_open(bs);

    if (rc < 0) {
        return rc;
    }

    flush_req = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = QEMU_AIO_FLUSH,
    };

#ifdef CONFIG_LINUX_IO_URING
    if (raw_check_linux_io_uring(s)) {
        return luring_co_submit(bs, s->fd, 0, NULL, QEMU_AIO_FLUSH);
    }
#endif
#ifdef CONFIG_LINUX_AIO
    if (s->has_laio_fdsync && raw_check_linux_aio(s)) {
        return laio_co_submit(s->fd, 0, NULL, QEMU_AIO_FLUSH, 0);
    }
#endif
    return raw_thread_pool_submit(handle_aiocb_flush, &flush_req);
}
2613

2614
/*
 * Close the file descriptor held in the driver state of @bs and mark it as
 * closed. With CONFIG_BLKZONED, bs->wps is freed as well. Nothing is done
 * when the descriptor is already negative.
 */
static void raw_close(BlockDriverState *bs)
{
    BDRVRawState *state = bs->opaque;

    if (state->fd < 0) {
        return;
    }

#if defined(CONFIG_BLKZONED)
    g_free(bs->wps);
#endif
    qemu_close(state->fd);
    state->fd = -1;
}
2626

2627
/**
2628
 * Truncates the given regular file @fd to @offset and, when growing, fills the
2629
 * new space according to @prealloc.
2630
 *
2631
 * Returns: 0 on success, -errno on failure.
2632
 */
2633
static int coroutine_fn
2634
raw_regular_truncate(BlockDriverState *bs, int fd, int64_t offset,
2635
                     PreallocMode prealloc, Error **errp)
2636
{
2637
    RawPosixAIOData acb;
2638

2639
    acb = (RawPosixAIOData) {
2640
        .bs             = bs,
2641
        .aio_fildes     = fd,
2642
        .aio_type       = QEMU_AIO_TRUNCATE,
2643
        .aio_offset     = offset,
2644
        .truncate       = {
2645
            .prealloc       = prealloc,
2646
            .errp           = errp,
2647
        },
2648
    };
2649

2650
    return raw_thread_pool_submit(handle_aiocb_truncate, &acb);
2651
}
2652

2653
/*
 * Resize @bs to @offset bytes.
 *
 * Regular files are always resized to exactly @offset, honouring @prealloc,
 * via raw_regular_truncate(). Character and block devices cannot actually be
 * resized: the call succeeds only if @offset does not exceed the current
 * device length and, when @exact is set, equals it. Preallocation is refused
 * for anything that is not a regular file. @flags is not used.
 *
 * Returns 0 on success. On failure a negative errno is returned and @errp is
 * set; this includes a failure to determine the current device length.
 */
static int coroutine_fn raw_co_truncate(BlockDriverState *bs, int64_t offset,
                                        bool exact, PreallocMode prealloc,
                                        BdrvRequestFlags flags, Error **errp)
{
    BDRVRawState *s = bs->opaque;
    struct stat st;
    int ret;

    if (fstat(s->fd, &st)) {
        ret = -errno;
        error_setg_errno(errp, -ret, "Failed to fstat() the file");
        return ret;
    }

    if (S_ISREG(st.st_mode)) {
        /* Always resizes to the exact @offset */
        return raw_regular_truncate(bs, s->fd, offset, prealloc, errp);
    }

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Preallocation mode '%s' unsupported for this "
                   "non-regular file", PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
        int64_t cur_length = raw_getlength(bs);

        if (cur_length < 0) {
            /*
             * raw_getlength() failed. Report that error instead of comparing
             * the negative value with @offset, which would produce a
             * misleading "cannot grow/resize" message.
             */
            error_setg_errno(errp, -cur_length,
                             "Could not get the device length");
            return cur_length;
        }

        if (offset != cur_length && exact) {
            error_setg(errp, "Cannot resize device files");
            return -ENOTSUP;
        } else if (offset > cur_length) {
            error_setg(errp, "Cannot grow device files");
            return -EINVAL;
        }
    } else {
        error_setg(errp, "Resizing this file is not supported");
        return -ENOTSUP;
    }

    return 0;
}
2695

2696
#ifdef __OpenBSD__
2697
static int64_t raw_getlength(BlockDriverState *bs)
2698
{
2699
    BDRVRawState *s = bs->opaque;
2700
    int fd = s->fd;
2701
    struct stat st;
2702

2703
    if (fstat(fd, &st))
2704
        return -errno;
2705
    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2706
        struct disklabel dl;
2707

2708
        if (ioctl(fd, DIOCGDINFO, &dl))
2709
            return -errno;
2710
        return (uint64_t)dl.d_secsize *
2711
            dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2712
    } else
2713
        return st.st_size;
2714
}
2715
#elif defined(__NetBSD__)
2716
static int64_t raw_getlength(BlockDriverState *bs)
2717
{
2718
    BDRVRawState *s = bs->opaque;
2719
    int fd = s->fd;
2720
    struct stat st;
2721

2722
    if (fstat(fd, &st))
2723
        return -errno;
2724
    if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) {
2725
        struct dkwedge_info dkw;
2726

2727
        if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) {
2728
            return dkw.dkw_size * 512;
2729
        } else {
2730
            struct disklabel dl;
2731

2732
            if (ioctl(fd, DIOCGDINFO, &dl))
2733
                return -errno;
2734
            return (uint64_t)dl.d_secsize *
2735
                dl.d_partitions[DISKPART(st.st_rdev)].p_size;
2736
        }
2737
    } else
2738
        return st.st_size;
2739
}
2740
#elif defined(__sun__)
2741
static int64_t raw_getlength(BlockDriverState *bs)
2742
{
2743
    BDRVRawState *s = bs->opaque;
2744
    struct dk_minfo minfo;
2745
    int ret;
2746
    int64_t size;
2747

2748
    ret = fd_open(bs);
2749
    if (ret < 0) {
2750
        return ret;
2751
    }
2752

2753
    /*
2754
     * Use the DKIOCGMEDIAINFO ioctl to read the size.
2755
     */
2756
    ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo);
2757
    if (ret != -1) {
2758
        return minfo.dki_lbsize * minfo.dki_capacity;
2759
    }
2760

2761
    /*
2762
     * There are reports that lseek on some devices fails, but
2763
     * irc discussion said that contingency on contingency was overkill.
2764
     */
2765
    size = lseek(s->fd, 0, SEEK_END);
2766
    if (size < 0) {
2767
        return -errno;
2768
    }
2769
    return size;
2770
}
2771
#elif defined(CONFIG_BSD)
2772
static int64_t raw_getlength(BlockDriverState *bs)
2773
{
2774
    BDRVRawState *s = bs->opaque;
2775
    int fd = s->fd;
2776
    int64_t size;
2777
    struct stat sb;
2778
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2779
    int reopened = 0;
2780
#endif
2781
    int ret;
2782

2783
    ret = fd_open(bs);
2784
    if (ret < 0)
2785
        return ret;
2786

2787
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
2788
again:
2789
#endif
2790
    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
2791
        size = 0;
2792
#ifdef DIOCGMEDIASIZE
2793
        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) {
2794
            size = 0;
2795
        }
2796
#endif
2797
#ifdef DIOCGPART
2798
        if (size == 0) {
2799
            struct partinfo pi;
2800
            if (ioctl(fd, DIOCGPART, &pi) == 0) {
2801
                size = pi.media_size;
2802
            }
2803
        }
2804
#endif
2805
#if defined(DKIOCGETBLOCKCOUNT) && defined(DKIOCGETBLOCKSIZE)
2806
        if (size == 0) {
2807
            uint64_t sectors = 0;
2808
            uint32_t sector_size = 0;
2809

2810
            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
2811
               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
2812
                size = sectors * sector_size;
2813
            }
2814
        }
2815
#endif
2816
        if (size == 0) {
2817
            size = lseek(fd, 0LL, SEEK_END);
2818
        }
2819
        if (size < 0) {
2820
            return -errno;
2821
        }
2822
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
2823
        switch(s->type) {
2824
        case FTYPE_CD:
2825
            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
2826
            if (size == 2048LL * (unsigned)-1)
2827
                size = 0;
2828
            /* XXX no disc?  maybe we need to reopen... */
2829
            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
2830
                reopened = 1;
2831
                goto again;
2832
            }
2833
        }
2834
#endif
2835
    } else {
2836
        size = lseek(fd, 0, SEEK_END);
2837
        if (size < 0) {
2838
            return -errno;
2839
        }
2840
    }
2841
    return size;
2842
}
2843
#else
2844
static int64_t raw_getlength(BlockDriverState *bs)
2845
{
2846
    BDRVRawState *s = bs->opaque;
2847
    int ret;
2848
    int64_t size;
2849

2850
    ret = fd_open(bs);
2851
    if (ret < 0) {
2852
        return ret;
2853
    }
2854

2855
    size = lseek(s->fd, 0, SEEK_END);
2856
    if (size < 0) {
2857
        return -errno;
2858
    }
2859
    return size;
2860
}
2861
#endif
2862

2863
/* Coroutine entry point for the length query; defers to raw_getlength(). */
static int64_t coroutine_fn raw_co_getlength(BlockDriverState *bs)
{
    int64_t length = raw_getlength(bs);

    return length;
}
2867

2868
/*
 * Report how much space the image occupies on the host: the st_blocks value
 * from fstat() scaled by 512.  Returns -errno if fstat() fails.
 */
static int64_t coroutine_fn raw_co_get_allocated_file_size(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    struct stat sb;

    if (fstat(s->fd, &sb) >= 0) {
        return (int64_t)sb.st_blocks * 512;
    }
    return -errno;
}
2878

2879
static int coroutine_fn
2880
raw_co_create(BlockdevCreateOptions *options, Error **errp)
2881
{
2882
    BlockdevCreateOptionsFile *file_opts;
2883
    Error *local_err = NULL;
2884
    int fd;
2885
    uint64_t perm, shared;
2886
    int result = 0;
2887

2888
    /* Validate options and set default values */
2889
    assert(options->driver == BLOCKDEV_DRIVER_FILE);
2890
    file_opts = &options->u.file;
2891

2892
    if (!file_opts->has_nocow) {
2893
        file_opts->nocow = false;
2894
    }
2895
    if (!file_opts->has_preallocation) {
2896
        file_opts->preallocation = PREALLOC_MODE_OFF;
2897
    }
2898
    if (!file_opts->has_extent_size_hint) {
2899
        file_opts->extent_size_hint = 1 * MiB;
2900
    }
2901
    if (file_opts->extent_size_hint > UINT32_MAX) {
2902
        result = -EINVAL;
2903
        error_setg(errp, "Extent size hint is too large");
2904
        goto out;
2905
    }
2906

2907
    /* Create file */
2908
    fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
2909
    if (fd < 0) {
2910
        result = -errno;
2911
        goto out;
2912
    }
2913

2914
    /* Take permissions: We want to discard everything, so we need
2915
     * BLK_PERM_WRITE; and truncation to the desired size requires
2916
     * BLK_PERM_RESIZE.
2917
     * On the other hand, we cannot share the RESIZE permission
2918
     * because we promise that after this function, the file has the
2919
     * size given in the options.  If someone else were to resize it
2920
     * concurrently, we could not guarantee that.
2921
     * Note that after this function, we can no longer guarantee that
2922
     * the file is not touched by a third party, so it may be resized
2923
     * then. */
2924
    perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2925
    shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
2926

2927
    /* Step one: Take locks */
2928
    result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
2929
    if (result < 0) {
2930
        goto out_close;
2931
    }
2932

2933
    /* Step two: Check that nobody else has taken conflicting locks */
2934
    result = raw_check_lock_bytes(fd, perm, shared, errp);
2935
    if (result < 0) {
2936
        error_append_hint(errp,
2937
                          "Is another process using the image [%s]?\n",
2938
                          file_opts->filename);
2939
        goto out_unlock;
2940
    }
2941

2942
    /* Clear the file by truncating it to 0 */
2943
    result = raw_regular_truncate(NULL, fd, 0, PREALLOC_MODE_OFF, errp);
2944
    if (result < 0) {
2945
        goto out_unlock;
2946
    }
2947

2948
    if (file_opts->nocow) {
2949
#ifdef __linux__
2950
        /* Set NOCOW flag to solve performance issue on fs like btrfs.
2951
         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
2952
         * will be ignored since any failure of this operation should not
2953
         * block the left work.
2954
         */
2955
        int attr;
2956
        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
2957
            attr |= FS_NOCOW_FL;
2958
            ioctl(fd, FS_IOC_SETFLAGS, &attr);
2959
        }
2960
#endif
2961
    }
2962
#ifdef FS_IOC_FSSETXATTR
2963
    /*
2964
     * Try to set the extent size hint. Failure is not fatal, and a warning is
2965
     * only printed if the option was explicitly specified.
2966
     */
2967
    {
2968
        struct fsxattr attr;
2969
        result = ioctl(fd, FS_IOC_FSGETXATTR, &attr);
2970
        if (result == 0) {
2971
            attr.fsx_xflags |= FS_XFLAG_EXTSIZE;
2972
            attr.fsx_extsize = file_opts->extent_size_hint;
2973
            result = ioctl(fd, FS_IOC_FSSETXATTR, &attr);
2974
        }
2975
        if (result < 0 && file_opts->has_extent_size_hint &&
2976
            file_opts->extent_size_hint)
2977
        {
2978
            warn_report("Failed to set extent size hint: %s",
2979
                        strerror(errno));
2980
        }
2981
    }
2982
#endif
2983

2984
    /* Resize and potentially preallocate the file to the desired
2985
     * final size */
2986
    result = raw_regular_truncate(NULL, fd, file_opts->size,
2987
                                  file_opts->preallocation, errp);
2988
    if (result < 0) {
2989
        goto out_unlock;
2990
    }
2991

2992
out_unlock:
2993
    raw_apply_lock_bytes(NULL, fd, 0, 0, true, &local_err);
2994
    if (local_err) {
2995
        /* The above call should not fail, and if it does, that does
2996
         * not mean the whole creation operation has failed.  So
2997
         * report it the user for their convenience, but do not report
2998
         * it to the caller. */
2999
        warn_report_err(local_err);
3000
    }
3001

3002
out_close:
3003
    if (qemu_close(fd) != 0 && result == 0) {
3004
        result = -errno;
3005
        error_setg_errno(errp, -result, "Could not close the new file");
3006
    }
3007
out:
3008
    return result;
3009
}
3010

3011
static int coroutine_fn GRAPH_RDLOCK
3012
raw_co_create_opts(BlockDriver *drv, const char *filename,
3013
                   QemuOpts *opts, Error **errp)
3014
{
3015
    BlockdevCreateOptions options;
3016
    int64_t total_size = 0;
3017
    int64_t extent_size_hint = 0;
3018
    bool has_extent_size_hint = false;
3019
    bool nocow = false;
3020
    PreallocMode prealloc;
3021
    char *buf = NULL;
3022
    Error *local_err = NULL;
3023

3024
    /* Skip file: protocol prefix */
3025
    strstart(filename, "file:", &filename);
3026

3027
    /* Read out options */
3028
    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
3029
                          BDRV_SECTOR_SIZE);
3030
    if (qemu_opt_get(opts, BLOCK_OPT_EXTENT_SIZE_HINT)) {
3031
        has_extent_size_hint = true;
3032
        extent_size_hint =
3033
            qemu_opt_get_size_del(opts, BLOCK_OPT_EXTENT_SIZE_HINT, -1);
3034
    }
3035
    nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
3036
    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
3037
    prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
3038
                               PREALLOC_MODE_OFF, &local_err);
3039
    g_free(buf);
3040
    if (local_err) {
3041
        error_propagate(errp, local_err);
3042
        return -EINVAL;
3043
    }
3044

3045
    options = (BlockdevCreateOptions) {
3046
        .driver     = BLOCKDEV_DRIVER_FILE,
3047
        .u.file     = {
3048
            .filename           = (char *) filename,
3049
            .size               = total_size,
3050
            .has_preallocation  = true,
3051
            .preallocation      = prealloc,
3052
            .has_nocow          = true,
3053
            .nocow              = nocow,
3054
            .has_extent_size_hint = has_extent_size_hint,
3055
            .extent_size_hint   = extent_size_hint,
3056
        },
3057
    };
3058
    return raw_co_create(&options, errp);
3059
}
3060

3061
/*
 * Remove the file named by bs->filename from the host filesystem.
 *
 * Only regular files are deleted.  If the path cannot be stat()ed or is not
 * a regular file, -ENOENT is returned.  If unlink() fails, -errno is
 * returned.  @errp is set in both failure cases.  Returns 0 on success.
 */
static int coroutine_fn raw_co_delete_file(BlockDriverState *bs,
                                           Error **errp)
{
    struct stat sb;
    bool is_regular = stat(bs->filename, &sb) == 0 && S_ISREG(sb.st_mode);

    if (!is_regular) {
        error_setg_errno(errp, ENOENT, "%s is not a regular file",
                         bs->filename);
        return -ENOENT;
    }

    if (unlink(bs->filename) < 0) {
        int err = errno;

        error_setg_errno(errp, err, "Error when deleting file %s",
                         bs->filename);
        return -err;
    }

    return 0;
}
3082

3083
/*
3084
 * Find allocation range in @bs around offset @start.
3085
 * May change underlying file descriptor's file offset.
3086
 * If @start is not in a hole, store @start in @data, and the
3087
 * beginning of the next hole in @hole, and return 0.
3088
 * If @start is in a non-trailing hole, store @start in @hole and the
3089
 * beginning of the next non-hole in @data, and return 0.
3090
 * If @start is in a trailing hole or beyond EOF, return -ENXIO.
3091
 * If we can't find out, return a negative errno other than -ENXIO.
3092
 */
3093
static int find_allocation(BlockDriverState *bs, off_t start,
3094
                           off_t *data, off_t *hole)
3095
{
3096
#if defined SEEK_HOLE && defined SEEK_DATA
3097
    BDRVRawState *s = bs->opaque;
3098
    off_t offs;
3099

3100
    /*
3101
     * SEEK_DATA cases:
3102
     * D1. offs == start: start is in data
3103
     * D2. offs > start: start is in a hole, next data at offs
3104
     * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
3105
     *                              or start is beyond EOF
3106
     *     If the latter happens, the file has been truncated behind
3107
     *     our back since we opened it.  All bets are off then.
3108
     *     Treating like a trailing hole is simplest.
3109
     * D4. offs < 0, errno != ENXIO: we learned nothing
3110
     */
3111
    offs = lseek(s->fd, start, SEEK_DATA);
3112
    if (offs < 0) {
3113
        return -errno;          /* D3 or D4 */
3114
    }
3115

3116
    if (offs < start) {
3117
        /* This is not a valid return by lseek().  We are safe to just return
3118
         * -EIO in this case, and we'll treat it like D4. */
3119
        return -EIO;
3120
    }
3121

3122
    if (offs > start) {
3123
        /* D2: in hole, next data at offs */
3124
        *hole = start;
3125
        *data = offs;
3126
        return 0;
3127
    }
3128

3129
    /* D1: in data, end not yet known */
3130

3131
    /*
3132
     * SEEK_HOLE cases:
3133
     * H1. offs == start: start is in a hole
3134
     *     If this happens here, a hole has been dug behind our back
3135
     *     since the previous lseek().
3136
     * H2. offs > start: either start is in data, next hole at offs,
3137
     *                   or start is in trailing hole, EOF at offs
3138
     *     Linux treats trailing holes like any other hole: offs ==
3139
     *     start.  Solaris seeks to EOF instead: offs > start (blech).
3140
     *     If that happens here, a hole has been dug behind our back
3141
     *     since the previous lseek().
3142
     * H3. offs < 0, errno = ENXIO: start is beyond EOF
3143
     *     If this happens, the file has been truncated behind our
3144
     *     back since we opened it.  Treat it like a trailing hole.
3145
     * H4. offs < 0, errno != ENXIO: we learned nothing
3146
     *     Pretend we know nothing at all, i.e. "forget" about D1.
3147
     */
3148
    offs = lseek(s->fd, start, SEEK_HOLE);
3149
    if (offs < 0) {
3150
        return -errno;          /* D1 and (H3 or H4) */
3151
    }
3152

3153
    if (offs < start) {
3154
        /* This is not a valid return by lseek().  We are safe to just return
3155
         * -EIO in this case, and we'll treat it like H4. */
3156
        return -EIO;
3157
    }
3158

3159
    if (offs > start) {
3160
        /*
3161
         * D1 and H2: either in data, next hole at offs, or it was in
3162
         * data but is now in a trailing hole.  In the latter case,
3163
         * all bets are off.  Treating it as if it there was data all
3164
         * the way to EOF is safe, so simply do that.
3165
         */
3166
        *data = start;
3167
        *hole = offs;
3168
        return 0;
3169
    }
3170

3171
    /* D1 and H1 */
3172
    return -EBUSY;
3173
#else
3174
    return -ENOTSUP;
3175
#endif
3176
}
3177

3178
/*
3179
 * Returns the allocation status of the specified offset.
3180
 *
3181
 * The block layer guarantees 'offset' and 'bytes' are within bounds.
3182
 *
3183
 * 'pnum' is set to the number of bytes (including and immediately following
3184
 * the specified offset) that are known to be in the same
3185
 * allocated/unallocated state.
3186
 *
3187
 * 'bytes' is a soft cap for 'pnum'.  If the information is free, 'pnum' may
3188
 * well exceed it.
3189
 */
3190
static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
3191
                                            bool want_zero,
3192
                                            int64_t offset,
3193
                                            int64_t bytes, int64_t *pnum,
3194
                                            int64_t *map,
3195
                                            BlockDriverState **file)
3196
{
3197
    off_t data = 0, hole = 0;
3198
    int ret;
3199

3200
    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
3201

3202
    ret = fd_open(bs);
3203
    if (ret < 0) {
3204
        return ret;
3205
    }
3206

3207
    if (!want_zero) {
3208
        *pnum = bytes;
3209
        *map = offset;
3210
        *file = bs;
3211
        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
3212
    }
3213

3214
    ret = find_allocation(bs, offset, &data, &hole);
3215
    if (ret == -ENXIO) {
3216
        /* Trailing hole */
3217
        *pnum = bytes;
3218
        ret = BDRV_BLOCK_ZERO;
3219
    } else if (ret < 0) {
3220
        /* No info available, so pretend there are no holes */
3221
        *pnum = bytes;
3222
        ret = BDRV_BLOCK_DATA;
3223
    } else if (data == offset) {
3224
        /* On a data extent, compute bytes to the end of the extent,
3225
         * possibly including a partial sector at EOF. */
3226
        *pnum = hole - offset;
3227

3228
        /*
3229
         * We are not allowed to return partial sectors, though, so
3230
         * round up if necessary.
3231
         */
3232
        if (!QEMU_IS_ALIGNED(*pnum, bs->bl.request_alignment)) {
3233
            int64_t file_length = raw_getlength(bs);
3234
            if (file_length > 0) {
3235
                /* Ignore errors, this is just a safeguard */
3236
                assert(hole == file_length);
3237
            }
3238
            *pnum = ROUND_UP(*pnum, bs->bl.request_alignment);
3239
        }
3240

3241
        ret = BDRV_BLOCK_DATA;
3242
    } else {
3243
        /* On a hole, compute bytes to the beginning of the next extent.  */
3244
        assert(hole == offset);
3245
        *pnum = data - offset;
3246
        ret = BDRV_BLOCK_ZERO;
3247
    }
3248
    *map = offset;
3249
    *file = bs;
3250
    return ret | BDRV_BLOCK_OFFSET_VALID;
3251
}
3252

3253
#if defined(__linux__)
3254
/* Verify that the file is not in the page cache */
3255
static void check_cache_dropped(BlockDriverState *bs, Error **errp)
3256
{
3257
    const size_t window_size = 128 * 1024 * 1024;
3258
    BDRVRawState *s = bs->opaque;
3259
    void *window = NULL;
3260
    size_t length = 0;
3261
    unsigned char *vec;
3262
    size_t page_size;
3263
    off_t offset;
3264
    off_t end;
3265

3266
    /* mincore(2) page status information requires 1 byte per page */
3267
    page_size = sysconf(_SC_PAGESIZE);
3268
    vec = g_malloc(DIV_ROUND_UP(window_size, page_size));
3269

3270
    end = raw_getlength(bs);
3271

3272
    for (offset = 0; offset < end; offset += window_size) {
3273
        void *new_window;
3274
        size_t new_length;
3275
        size_t vec_end;
3276
        size_t i;
3277
        int ret;
3278

3279
        /* Unmap previous window if size has changed */
3280
        new_length = MIN(end - offset, window_size);
3281
        if (new_length != length) {
3282
            munmap(window, length);
3283
            window = NULL;
3284
            length = 0;
3285
        }
3286

3287
        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
3288
                          s->fd, offset);
3289
        if (new_window == MAP_FAILED) {
3290
            error_setg_errno(errp, errno, "mmap failed");
3291
            break;
3292
        }
3293

3294
        window = new_window;
3295
        length = new_length;
3296

3297
        ret = mincore(window, length, vec);
3298
        if (ret < 0) {
3299
            error_setg_errno(errp, errno, "mincore failed");
3300
            break;
3301
        }
3302

3303
        vec_end = DIV_ROUND_UP(length, page_size);
3304
        for (i = 0; i < vec_end; i++) {
3305
            if (vec[i] & 0x1) {
3306
                break;
3307
            }
3308
        }
3309
        if (i < vec_end) {
3310
            error_setg(errp, "page cache still in use!");
3311
            break;
3312
        }
3313
    }
3314

3315
    if (window) {
3316
        munmap(window, length);
3317
    }
3318

3319
    g_free(vec);
3320
}
3321
#endif /* __linux__ */
3322

3323
static void coroutine_fn GRAPH_RDLOCK
3324
raw_co_invalidate_cache(BlockDriverState *bs, Error **errp)
3325
{
3326
    BDRVRawState *s = bs->opaque;
3327
    int ret;
3328

3329
    ret = fd_open(bs);
3330
    if (ret < 0) {
3331
        error_setg_errno(errp, -ret, "The file descriptor is not open");
3332
        return;
3333
    }
3334

3335
    if (!s->drop_cache) {
3336
        return;
3337
    }
3338

3339
    if (s->open_flags & O_DIRECT) {
3340
        return; /* No host kernel page cache */
3341
    }
3342

3343
#if defined(__linux__)
3344
    /* This sets the scene for the next syscall... */
3345
    ret = bdrv_co_flush(bs);
3346
    if (ret < 0) {
3347
        error_setg_errno(errp, -ret, "flush failed");
3348
        return;
3349
    }
3350

3351
    /* Linux does not invalidate pages that are dirty, locked, or mmapped by a
3352
     * process.  These limitations are okay because we just fsynced the file,
3353
     * we don't use mmap, and the file should not be in use by other processes.
3354
     */
3355
    ret = posix_fadvise(s->fd, 0, 0, POSIX_FADV_DONTNEED);
3356
    if (ret != 0) { /* the return value is a positive errno */
3357
        error_setg_errno(errp, ret, "fadvise failed");
3358
        return;
3359
    }
3360

3361
    if (s->check_cache_dropped) {
3362
        check_cache_dropped(bs, errp);
3363
    }
3364
#else /* __linux__ */
3365
    /* Do nothing.  Live migration to a remote host with cache.direct=off is
3366
     * unsupported on other host operating systems.  Cache consistency issues
3367
     * may occur but no error is reported here, partly because that's the
3368
     * historical behavior and partly because it's hard to differentiate valid
3369
     * configurations that should not cause errors.
3370
     */
3371
#endif /* !__linux__ */
3372
}
3373

3374
/*
 * Update the discard statistics in @s for one finished request.
 * A zero @ret counts as success and adds @nbytes to the byte counter; any
 * other value counts as a failed request.
 */
static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
{
    if (ret == 0) {
        s->stats.discard_nb_ok++;
        s->stats.discard_bytes_ok += nbytes;
        return;
    }

    s->stats.discard_nb_failed++;
}
3383

3384
/*
 * zone report - Get a zoned block device's information in the form of an
 * array of zone descriptors.
 *
 * @zones is the array of zone descriptors that receives the zone
 * information; @offset is a byte offset on the device; @nr_zones points to
 * the number of zones requested.
 *
 * The request is executed by handle_aiocb_zone_report() in the thread pool
 * and its result is returned.
 */
#if defined(CONFIG_BLKZONED)
static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
                                           unsigned int *nr_zones,
                                           BlockZoneDescriptor *zones)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb = {
        .bs          = bs,
        .aio_fildes  = s->fd,
        .aio_type    = QEMU_AIO_ZONE_REPORT,
        .aio_offset  = offset,
        .zone_report = {
            .nr_zones = nr_zones,
            .zones    = zones,
        },
    };

    trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);

    return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
}
#endif
3411

3412
/*
 * zone management operations - Execute an operation on a zone
 *
 * @op selects open/close/finish/reset, applied to the byte range
 * [@offset, @offset + @len).  @offset must be zone aligned; @len must be
 * zone aligned unless the range ends exactly at the device capacity.
 *
 * After a successful reset or finish, the cached write pointers in bs->wps
 * are updated to match.  Returns 0 (or the non-negative result of
 * get_zones_wp() for a whole-device reset) on success, a negative errno
 * otherwise.
 */
#if defined(CONFIG_BLKZONED)
static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
        int64_t offset, int64_t len)
{
    BDRVRawState *s = bs->opaque;
    BlockZoneWps *wps = bs->wps;
    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
    int64_t zone_size = bs->bl.zone_size;
    int64_t zone_size_mask = zone_size - 1;
    RawPosixAIOData acb;
    const char *op_name;
    unsigned long zo;
    uint32_t first_zone;
    uint32_t nrz;
    uint64_t *wp;
    int ret;

    if (offset & zone_size_mask) {
        error_report("sector offset %" PRId64 " is not aligned to zone size "
                     "%" PRId64 "", offset / 512, zone_size / 512);
        return -EINVAL;
    }

    /*
     * The length may only be unaligned when the range ends exactly at the
     * end of the device; it may never reach beyond it.
     */
    if ((offset + len < capacity && (len & zone_size_mask)) ||
        offset + len > capacity) {
        error_report("number of sectors %" PRId64 " is not aligned to zone size"
                      " %" PRId64 "", len / 512, zone_size / 512);
        return -EINVAL;
    }

    /* Cached write pointer of the first zone touched by the request */
    first_zone = offset / bs->bl.zone_size;
    nrz = len / bs->bl.zone_size;
    wp = &wps->wp[first_zone];
    if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
        error_report("zone mgmt operations are not allowed for conventional zones");
        return -EIO;
    }

    /* Translate the block layer operation into the ioctl request */
    switch (op) {
    case BLK_ZO_OPEN:
        zo = BLKOPENZONE;
        op_name = "BLKOPENZONE";
        break;
    case BLK_ZO_CLOSE:
        zo = BLKCLOSEZONE;
        op_name = "BLKCLOSEZONE";
        break;
    case BLK_ZO_FINISH:
        zo = BLKFINISHZONE;
        op_name = "BLKFINISHZONE";
        break;
    case BLK_ZO_RESET:
        zo = BLKRESETZONE;
        op_name = "BLKRESETZONE";
        break;
    default:
        error_report("Unsupported zone op: 0x%x", op);
        return -ENOTSUP;
    }

    acb = (RawPosixAIOData) {
        .bs         = bs,
        .aio_fildes = s->fd,
        .aio_type   = QEMU_AIO_ZONE_MGMT,
        .aio_offset = offset,
        .aio_nbytes = len,
        .zone_mgmt  = {
            .op = zo,
        },
    };

    trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
                        len >> BDRV_SECTOR_BITS);
    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
    if (ret != 0) {
        /* The request failed: refresh the write pointers of the range */
        update_zones_wp(bs, s->fd, offset, nrz);
        error_report("ioctl %s failed %d", op_name, ret);
        return ret;
    }

    if (zo == BLKRESETZONE) {
        if (len == capacity) {
            /* Whole-device reset: reload the write pointers of all zones */
            ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
            if (ret < 0) {
                error_report("reporting single wp failed");
                return ret;
            }
        } else {
            /* A reset zone has its write pointer back at the zone start */
            for (unsigned int j = 0; j < nrz; ++j) {
                wp[j] = offset + j * zone_size;
            }
        }
    } else if (zo == BLKFINISHZONE) {
        for (unsigned int j = 0; j < nrz; ++j) {
            /*
             * The zoned device allows the last zone to be smaller than the
             * zone size.
             */
            wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
        }
    }

    return ret;
}
#endif
3513

3514
#if defined(CONFIG_BLKZONED)
/*
 * Zone append: write the data in @qiov to the zone starting at *@offset.
 *
 * *@offset must be aligned to the zone size, and the length of every vector
 * element must be a multiple of the write granularity; otherwise -EINVAL is
 * returned.  The request itself is carried out by raw_co_prw() with
 * QEMU_AIO_ZONE_APPEND, whose result is returned.  @flags must be 0.
 */
static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
                                           int64_t *offset,
                                           QEMUIOVector *qiov,
                                           BdrvRequestFlags flags)
{
    assert(flags == 0);
    int64_t zone_size_mask = bs->bl.zone_size - 1;
    int64_t wg = bs->bl.write_granularity;
    int64_t wg_mask = wg - 1;
    int64_t len = 0;

    if (*offset & zone_size_mask) {
        error_report("sector offset %" PRId64 " is not aligned to zone size "
                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
        return -EINVAL;
    }

    /* Validate each element and accumulate the total request length */
    for (int i = 0; i < qiov->niov; i++) {
        int64_t iov_len = qiov->iov[i].iov_len;

        if (iov_len & wg_mask) {
            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
                         "block size %" PRId64 "", i, iov_len, wg);
            return -EINVAL;
        }
        len += iov_len;
    }

    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);

    return raw_co_prw(bs, offset, len, qiov, QEMU_AIO_ZONE_APPEND);
}
#endif
3546

3547
static coroutine_fn int
3548
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
3549
                bool blkdev)
3550
{
3551
    BDRVRawState *s = bs->opaque;
3552
    RawPosixAIOData acb;
3553
    int ret;
3554

3555
    acb = (RawPosixAIOData) {
3556
        .bs             = bs,
3557
        .aio_fildes     = s->fd,
3558
        .aio_type       = QEMU_AIO_DISCARD,
3559
        .aio_offset     = offset,
3560
        .aio_nbytes     = bytes,
3561
    };
3562

3563
    if (blkdev) {
3564
        acb.aio_type |= QEMU_AIO_BLKDEV;
3565
    }
3566

3567
    ret = raw_thread_pool_submit(handle_aiocb_discard, &acb);
3568
    raw_account_discard(s, bytes, ret);
3569
    return ret;
3570
}
3571

3572
static coroutine_fn int
3573
raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
3574
{
3575
    return raw_do_pdiscard(bs, offset, bytes, false);
3576
}
3577

3578
static int coroutine_fn
3579
raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
3580
                     BdrvRequestFlags flags, bool blkdev)
3581
{
3582
    BDRVRawState *s = bs->opaque;
3583
    RawPosixAIOData acb;
3584
    ThreadPoolFunc *handler;
3585

3586
#ifdef CONFIG_FALLOCATE
3587
    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
3588
        BdrvTrackedRequest *req;
3589

3590
        /*
3591
         * This is a workaround for a bug in the Linux XFS driver,
3592
         * where writes submitted through the AIO interface will be
3593
         * discarded if they happen beyond a concurrently running
3594
         * fallocate() that increases the file length (i.e., both the
3595
         * write and the fallocate() happen beyond the EOF).
3596
         *
3597
         * To work around it, we extend the tracked request for this
3598
         * zero write until INT64_MAX (effectively infinity), and mark
3599
         * it as serializing.
3600
         *
3601
         * We have to enable this workaround for all filesystems and
3602
         * AIO modes (not just XFS with aio=native), because for
3603
         * remote filesystems we do not know the host configuration.
3604
         */
3605

3606
        req = bdrv_co_get_self_request(bs);
3607
        assert(req);
3608
        assert(req->type == BDRV_TRACKED_WRITE);
3609
        assert(req->offset <= offset);
3610
        assert(req->offset + req->bytes >= offset + bytes);
3611

3612
        req->bytes = BDRV_MAX_LENGTH - req->offset;
3613

3614
        bdrv_check_request(req->offset, req->bytes, &error_abort);
3615

3616
        bdrv_make_request_serialising(req, bs->bl.request_alignment);
3617
    }
3618
#endif
3619

3620
    acb = (RawPosixAIOData) {
3621
        .bs             = bs,
3622
        .aio_fildes     = s->fd,
3623
        .aio_type       = QEMU_AIO_WRITE_ZEROES,
3624
        .aio_offset     = offset,
3625
        .aio_nbytes     = bytes,
3626
    };
3627

3628
    if (blkdev) {
3629
        acb.aio_type |= QEMU_AIO_BLKDEV;
3630
    }
3631
    if (flags & BDRV_REQ_NO_FALLBACK) {
3632
        acb.aio_type |= QEMU_AIO_NO_FALLBACK;
3633
    }
3634

3635
    if (flags & BDRV_REQ_MAY_UNMAP) {
3636
        acb.aio_type |= QEMU_AIO_DISCARD;
3637
        handler = handle_aiocb_write_zeroes_unmap;
3638
    } else {
3639
        handler = handle_aiocb_write_zeroes;
3640
    }
3641

3642
    return raw_thread_pool_submit(handler, &acb);
3643
}
3644

3645
/* Write-zeroes entry point: raw_do_pwrite_zeroes() with blkdev cleared. */
static int coroutine_fn raw_co_pwrite_zeroes(
    BlockDriverState *bs, int64_t offset,
    int64_t bytes, BdrvRequestFlags flags)
{
    const bool blkdev = false;

    return raw_do_pwrite_zeroes(bs, offset, bytes, flags, blkdev);
}
3651

3652
static int coroutine_fn
3653
raw_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
3654
{
3655
    return 0;
3656
}
3657

3658
static ImageInfoSpecific *raw_get_specific_info(BlockDriverState *bs,
3659
                                                Error **errp)
3660
{
3661
    ImageInfoSpecificFile *file_info = g_new0(ImageInfoSpecificFile, 1);
3662
    ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1);
3663

3664
    *spec_info = (ImageInfoSpecific){
3665
        .type = IMAGE_INFO_SPECIFIC_KIND_FILE,
3666
        .u.file.data = file_info,
3667
    };
3668

3669
#ifdef FS_IOC_FSGETXATTR
3670
    {
3671
        BDRVRawState *s = bs->opaque;
3672
        struct fsxattr attr;
3673
        int ret;
3674

3675
        ret = ioctl(s->fd, FS_IOC_FSGETXATTR, &attr);
3676
        if (!ret && attr.fsx_extsize != 0) {
3677
            file_info->has_extent_size_hint = true;
3678
            file_info->extent_size_hint = attr.fsx_extsize;
3679
        }
3680
    }
3681
#endif
3682

3683
    return spec_info;
3684
}
3685

3686
/* Copy the discard counters kept in the driver state into a stats struct. */
static BlockStatsSpecificFile get_blockstats_specific_file(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    BlockStatsSpecificFile file_stats = {
        .discard_nb_ok     = s->stats.discard_nb_ok,
        .discard_nb_failed = s->stats.discard_nb_failed,
        .discard_bytes_ok  = s->stats.discard_bytes_ok,
    };

    return file_stats;
}
3695

3696
/*
 * Return newly allocated driver-specific statistics for a "file" node,
 * filled from the discard counters of @bs.
 */
static BlockStatsSpecific *raw_get_specific_stats(BlockDriverState *bs)
{
    BlockStatsSpecific *result = g_new(BlockStatsSpecific, 1);

    result->driver = BLOCKDEV_DRIVER_FILE;
    result->u.file = get_blockstats_specific_file(bs);
    return result;
}
3705

3706
#if defined(HAVE_HOST_BLOCK_DEVICE)
/*
 * Return newly allocated driver-specific statistics for a "host_device"
 * node, filled from the discard counters of @bs.
 */
static BlockStatsSpecific *hdev_get_specific_stats(BlockDriverState *bs)
{
    BlockStatsSpecific *result = g_new(BlockStatsSpecific, 1);

    result->driver = BLOCKDEV_DRIVER_HOST_DEVICE;
    result->u.host_device = get_blockstats_specific_file(bs);
    return result;
}
#endif /* HAVE_HOST_BLOCK_DEVICE */
3717

3718
static QemuOptsList raw_create_opts = {
3719
    .name = "raw-create-opts",
3720
    .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
3721
    .desc = {
3722
        {
3723
            .name = BLOCK_OPT_SIZE,
3724
            .type = QEMU_OPT_SIZE,
3725
            .help = "Virtual disk size"
3726
        },
3727
        {
3728
            .name = BLOCK_OPT_NOCOW,
3729
            .type = QEMU_OPT_BOOL,
3730
            .help = "Turn off copy-on-write (valid only on btrfs)"
3731
        },
3732
        {
3733
            .name = BLOCK_OPT_PREALLOC,
3734
            .type = QEMU_OPT_STRING,
3735
            .help = "Preallocation mode (allowed values: off"
3736
#ifdef CONFIG_POSIX_FALLOCATE
3737
                    ", falloc"
3738
#endif
3739
                    ", full)"
3740
        },
3741
        {
3742
            .name = BLOCK_OPT_EXTENT_SIZE_HINT,
3743
            .type = QEMU_OPT_SIZE,
3744
            .help = "Extent size hint for the image file, 0 to disable"
3745
        },
3746
        { /* end of list */ }
3747
    }
3748
};
3749

3750
static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
3751
                          Error **errp)
3752
{
3753
    BDRVRawState *s = bs->opaque;
3754
    int input_flags = s->reopen_state ? s->reopen_state->flags : bs->open_flags;
3755
    int open_flags;
3756
    int ret;
3757

3758
    /* We may need a new fd if auto-read-only switches the mode */
3759
    ret = raw_reconfigure_getfd(bs, input_flags, &open_flags, perm, errp);
3760
    if (ret < 0) {
3761
        return ret;
3762
    } else if (ret != s->fd) {
3763
        Error *local_err = NULL;
3764

3765
        /*
3766
         * Fail already check_perm() if we can't get a working O_DIRECT
3767
         * alignment with the new fd.
3768
         */
3769
        raw_probe_alignment(bs, ret, &local_err);
3770
        if (local_err) {
3771
            error_propagate(errp, local_err);
3772
            return -EINVAL;
3773
        }
3774

3775
        s->perm_change_fd = ret;
3776
        s->perm_change_flags = open_flags;
3777
    }
3778

3779
    /* Prepare permissions on old fd to avoid conflicts between old and new,
3780
     * but keep everything locked that new will need. */
3781
    ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
3782
    if (ret < 0) {
3783
        goto fail;
3784
    }
3785

3786
    /* Copy locks to the new fd */
3787
    if (s->perm_change_fd && s->use_lock) {
3788
        ret = raw_apply_lock_bytes(NULL, s->perm_change_fd, perm, ~shared,
3789
                                   false, errp);
3790
        if (ret < 0) {
3791
            raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
3792
            goto fail;
3793
        }
3794
    }
3795
    return 0;
3796

3797
fail:
3798
    if (s->perm_change_fd) {
3799
        qemu_close(s->perm_change_fd);
3800
    }
3801
    s->perm_change_fd = 0;
3802
    return ret;
3803
}
3804

3805
/*
 * .bdrv_set_perm implementation: commit a permission change.
 *
 * Switches to the descriptor stashed in s->perm_change_fd (if any), commits
 * the permission locks and records the new @perm / @shared masks.
 */
static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
{
    BDRVRawState *s = bs->opaque;
    int pending_fd = s->perm_change_fd;

    /*
     * For reopen, we have already switched to the new fd (.bdrv_set_perm is
     * called after .bdrv_reopen_commit), hence the comparison with s->fd.
     */
    if (pending_fd && pending_fd != s->fd) {
        qemu_close(s->fd);
        s->fd = pending_fd;
        s->open_flags = s->perm_change_flags;
    }
    s->perm_change_fd = 0;

    raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);

    s->shared_perm = shared;
    s->perm = perm;
}
3822

3823
/*
 * .bdrv_abort_perm_update implementation: drop a pending permission change.
 *
 * Closes the descriptor stashed in s->perm_change_fd (if any) and aborts the
 * permission lock update.
 */
static void raw_abort_perm_update(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int pending_fd = s->perm_change_fd;

    /*
     * For reopen, .bdrv_reopen_abort is called afterwards and will close
     * the file descriptor.
     */
    if (pending_fd) {
        qemu_close(pending_fd);
    }
    s->perm_change_fd = 0;

    raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
}
3836

3837
/*
 * .bdrv_co_copy_range_from implementation.
 *
 * Forwards the request unchanged to bdrv_co_copy_range_to(); @bs itself is
 * not used here.
 */
static int coroutine_fn GRAPH_RDLOCK
raw_co_copy_range_from(BlockDriverState *bs,
                       BdrvChild *src, int64_t src_offset,
                       BdrvChild *dst, int64_t dst_offset,
                       int64_t bytes, BdrvRequestFlags read_flags,
                       BdrvRequestFlags write_flags)
{
    return bdrv_co_copy_range_to(src, src_offset,
                                 dst, dst_offset,
                                 bytes, read_flags, write_flags);
}
3845

3846
static int coroutine_fn GRAPH_RDLOCK
3847
raw_co_copy_range_to(BlockDriverState *bs,
3848
                     BdrvChild *src, int64_t src_offset,
3849
                     BdrvChild *dst, int64_t dst_offset,
3850
                     int64_t bytes, BdrvRequestFlags read_flags,
3851
                     BdrvRequestFlags write_flags)
3852
{
3853
    RawPosixAIOData acb;
3854
    BDRVRawState *s = bs->opaque;
3855
    BDRVRawState *src_s;
3856

3857
    assert(dst->bs == bs);
3858
    if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
3859
        return -ENOTSUP;
3860
    }
3861

3862
    src_s = src->bs->opaque;
3863
    if (fd_open(src->bs) < 0 || fd_open(dst->bs) < 0) {
3864
        return -EIO;
3865
    }
3866

3867
    acb = (RawPosixAIOData) {
3868
        .bs             = bs,
3869
        .aio_type       = QEMU_AIO_COPY_RANGE,
3870
        .aio_fildes     = src_s->fd,
3871
        .aio_offset     = src_offset,
3872
        .aio_nbytes     = bytes,
3873
        .copy_range     = {
3874
            .aio_fd2        = s->fd,
3875
            .aio_offset2    = dst_offset,
3876
        },
3877
    };
3878

3879
    return raw_thread_pool_submit(handle_aiocb_copy_range, &acb);
3880
}
3881

3882
BlockDriver bdrv_file = {
3883
    .format_name = "file",
3884
    .protocol_name = "file",
3885
    .instance_size = sizeof(BDRVRawState),
3886
    .bdrv_needs_filename = true,
3887
    .bdrv_probe = NULL, /* no probe for protocols */
3888
    .bdrv_parse_filename = raw_parse_filename,
3889
    .bdrv_open      = raw_open,
3890
    .bdrv_reopen_prepare = raw_reopen_prepare,
3891
    .bdrv_reopen_commit = raw_reopen_commit,
3892
    .bdrv_reopen_abort = raw_reopen_abort,
3893
    .bdrv_close = raw_close,
3894
    .bdrv_co_create = raw_co_create,
3895
    .bdrv_co_create_opts = raw_co_create_opts,
3896
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
3897
    .bdrv_co_block_status = raw_co_block_status,
3898
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
3899
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,
3900
    .bdrv_co_delete_file = raw_co_delete_file,
3901

3902
    .bdrv_co_preadv         = raw_co_preadv,
3903
    .bdrv_co_pwritev        = raw_co_pwritev,
3904
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
3905
    .bdrv_co_pdiscard       = raw_co_pdiscard,
3906
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
3907
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
3908
    .bdrv_refresh_limits = raw_refresh_limits,
3909

3910
    .bdrv_co_truncate                   = raw_co_truncate,
3911
    .bdrv_co_getlength                  = raw_co_getlength,
3912
    .bdrv_co_get_info                   = raw_co_get_info,
3913
    .bdrv_get_specific_info             = raw_get_specific_info,
3914
    .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
3915
    .bdrv_get_specific_stats = raw_get_specific_stats,
3916
    .bdrv_check_perm = raw_check_perm,
3917
    .bdrv_set_perm   = raw_set_perm,
3918
    .bdrv_abort_perm_update = raw_abort_perm_update,
3919
    .create_opts = &raw_create_opts,
3920
    .mutable_opts = mutable_opts,
3921
};
3922

3923
/***********************************************/
3924
/* host device */
3925

3926
#if defined(HAVE_HOST_BLOCK_DEVICE)
3927

3928
#if defined(__APPLE__) && defined(__MACH__)
3929
static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3930
                                CFIndex maxPathSize, int flags);
3931

3932
/*
 * Look for ejectable optical media, trying the DVD media class first and
 * the CD media class second.
 *
 * On a match, *@mediaIterator is the iterator obtained from
 * IOServiceGetMatchingServices() and the matched class name is returned as
 * a g_strdup()ed string that the caller must g_free().
 *
 * Returns NULL if no media was found or if the I/O Kit main port could not
 * be obtained.
 */
static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator)
{
    kern_return_t kernResult = KERN_FAILURE;
    mach_port_t mainPort;
    CFMutableDictionaryRef classesToMatch;
    const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass};
    char *mediaType = NULL;
    size_t index;

    kernResult = IOMainPort(MACH_PORT_NULL, &mainPort);
    if (kernResult != KERN_SUCCESS) {
        /*
         * Without a valid main port the lookups below cannot work, and
         * mainPort must not be read: report and give up.
         */
        error_report("IOMainPort returned %d", kernResult);
        return NULL;
    }

    for (index = 0; index < ARRAY_SIZE(matching_array); index++) {
        classesToMatch = IOServiceMatching(matching_array[index]);
        if (classesToMatch == NULL) {
            error_report("IOServiceMatching returned NULL for %s",
                         matching_array[index]);
            continue;
        }
        CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey),
                             kCFBooleanTrue);
        kernResult = IOServiceGetMatchingServices(mainPort, classesToMatch,
                                                  mediaIterator);
        if (kernResult != KERN_SUCCESS) {
            error_report("Note: IOServiceGetMatchingServices returned %d",
                         kernResult);
            continue;
        }

        /* If a match was found, leave the loop */
        if (*mediaIterator != 0) {
            trace_file_FindEjectableOpticalMedia(matching_array[index]);
            mediaType = g_strdup(matching_array[index]);
            break;
        }
    }
    return mediaType;
}
3972

3973
kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath,
3974
                         CFIndex maxPathSize, int flags)
3975
{
3976
    io_object_t     nextMedia;
3977
    kern_return_t   kernResult = KERN_FAILURE;
3978
    *bsdPath = '\0';
3979
    nextMedia = IOIteratorNext( mediaIterator );
3980
    if ( nextMedia )
3981
    {
3982
        CFTypeRef   bsdPathAsCFString;
3983
    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
3984
        if ( bsdPathAsCFString ) {
3985
            size_t devPathLength;
3986
            strcpy( bsdPath, _PATH_DEV );
3987
            if (flags & BDRV_O_NOCACHE) {
3988
                strcat(bsdPath, "r");
3989
            }
3990
            devPathLength = strlen( bsdPath );
3991
            if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
3992
                kernResult = KERN_SUCCESS;
3993
            }
3994
            CFRelease( bsdPathAsCFString );
3995
        }
3996
        IOObjectRelease( nextMedia );
3997
    }
3998

3999
    return kernResult;
4000
}
4001

4002
/* Sets up a real cdrom for use in QEMU */
4003
/*
 * Sets up a real cdrom for use in QEMU.
 *
 * Tries "<bsd_path>s0" and "<bsd_path>s1" in turn; the first one that can
 * be opened read-only is copied back into @bsd_path (a buffer of
 * MAXPATHLEN bytes) and true is returned.  If none can be opened, @errp is
 * set and false is returned.
 */
static bool setup_cdrom(char *bsd_path, Error **errp)
{
    const int num_of_test_partitions = 2;
    char test_partition[MAXPATHLEN];
    int index;

    /* look for a working partition */
    for (index = 0; index < num_of_test_partitions; index++) {
        int fd;

        snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
                 index);
        fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, NULL);
        if (fd < 0) {
            continue;
        }

        qemu_close(fd);
        trace_file_setup_cdrom(test_partition);
        pstrcpy(bsd_path, MAXPATHLEN, test_partition);
        return true;
    }

    /* a working partition on the device was not found */
    error_setg(errp, "Failed to find a working partition on disc");
    return false;
}
4030

4031
/* Prints directions on mounting and unmounting a device */
4032
/*
 * Reports, via error_report(), the diskutil commands for unmounting and
 * mounting the device @file_name.
 */
static void print_unmounting_directions(const char *file_name)
{
    error_report("If device %s is mounted on the desktop, "
                 "unmount it first before using it in QEMU",
                 file_name);
    error_report("Command to unmount device: "
                 "diskutil unmountDisk %s",
                 file_name);
    error_report("Command to mount device: "
                 "diskutil mountDisk %s",
                 file_name);
}
4040

4041
#endif /* defined(__APPLE__) && defined(__MACH__) */
4042

4043
/*
 * .bdrv_probe_device for "host_device".
 *
 * Returns 50 for names starting with "/dev/cdrom", 100 for an existing
 * character or block device, and 0 otherwise.
 */
static int hdev_probe_device(const char *filename)
{
    struct stat st;

    /* allow a dedicated CD-ROM driver to match with a higher priority */
    if (strstart(filename, "/dev/cdrom", NULL)) {
        return 50;
    }

    if (stat(filename, &st) < 0) {
        return 0;
    }

    return (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) ? 100 : 0;
}
4058

4059
static void hdev_parse_filename(const char *filename, QDict *options,
4060
                                Error **errp)
4061
{
4062
    bdrv_parse_filename_strip_prefix(filename, "host_device:", options);
4063
}
4064

4065
/*
 * Report whether the opened node looks like an SG device.
 *
 * On Linux this requires bs->filename to be a character device and both
 * the SG_GET_VERSION_NUM and SG_GET_SCSI_ID ioctls to succeed on s->fd.
 * On every other platform the answer is always false.
 */
static bool hdev_is_sg(BlockDriverState *bs)
{
#if defined(__linux__)
    BDRVRawState *s = bs->opaque;
    struct sg_scsi_id scsiid;
    struct stat st;
    int sg_version;

    if (stat(bs->filename, &st) < 0) {
        return false;
    }
    if (!S_ISCHR(st.st_mode)) {
        return false;
    }

    if (ioctl(s->fd, SG_GET_VERSION_NUM, &sg_version) < 0) {
        return false;
    }
    if (ioctl(s->fd, SG_GET_SCSI_ID, &scsiid) < 0) {
        return false;
    }

    trace_file_hdev_is_sg(scsiid.scsi_type, sg_version);
    return true;
#else
    return false;
#endif
}
4095

4096
static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
4097
                     Error **errp)
4098
{
4099
    BDRVRawState *s = bs->opaque;
4100
    int ret;
4101

4102
#if defined(__APPLE__) && defined(__MACH__)
4103
    /*
4104
     * Caution: while qdict_get_str() is fine, getting non-string types
4105
     * would require more care.  When @options come from -blockdev or
4106
     * blockdev_add, its members are typed according to the QAPI
4107
     * schema, but when they come from -drive, they're all QString.
4108
     */
4109
    const char *filename = qdict_get_str(options, "filename");
4110
    char bsd_path[MAXPATHLEN] = "";
4111
    bool error_occurred = false;
4112

4113
    /* If using a real cdrom */
4114
    if (strcmp(filename, "/dev/cdrom") == 0) {
4115
        char *mediaType = NULL;
4116
        kern_return_t ret_val;
4117
        io_iterator_t mediaIterator = 0;
4118

4119
        mediaType = FindEjectableOpticalMedia(&mediaIterator);
4120
        if (mediaType == NULL) {
4121
            error_setg(errp, "Please make sure your CD/DVD is in the optical"
4122
                       " drive");
4123
            error_occurred = true;
4124
            goto hdev_open_Mac_error;
4125
        }
4126

4127
        ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags);
4128
        if (ret_val != KERN_SUCCESS) {
4129
            error_setg(errp, "Could not get BSD path for optical drive");
4130
            error_occurred = true;
4131
            goto hdev_open_Mac_error;
4132
        }
4133

4134
        /* If a real optical drive was not found */
4135
        if (bsd_path[0] == '\0') {
4136
            error_setg(errp, "Failed to obtain bsd path for optical drive");
4137
            error_occurred = true;
4138
            goto hdev_open_Mac_error;
4139
        }
4140

4141
        /* If using a cdrom disc and finding a partition on the disc failed */
4142
        if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 &&
4143
            setup_cdrom(bsd_path, errp) == false) {
4144
            print_unmounting_directions(bsd_path);
4145
            error_occurred = true;
4146
            goto hdev_open_Mac_error;
4147
        }
4148

4149
        qdict_put_str(options, "filename", bsd_path);
4150

4151
hdev_open_Mac_error:
4152
        g_free(mediaType);
4153
        if (mediaIterator) {
4154
            IOObjectRelease(mediaIterator);
4155
        }
4156
        if (error_occurred) {
4157
            return -ENOENT;
4158
        }
4159
    }
4160
#endif /* defined(__APPLE__) && defined(__MACH__) */
4161

4162
    s->type = FTYPE_FILE;
4163

4164
    ret = raw_open_common(bs, options, flags, 0, true, errp);
4165
    if (ret < 0) {
4166
#if defined(__APPLE__) && defined(__MACH__)
4167
        if (*bsd_path) {
4168
            filename = bsd_path;
4169
        }
4170
        /* if a physical device experienced an error while being opened */
4171
        if (strncmp(filename, "/dev/", 5) == 0) {
4172
            print_unmounting_directions(filename);
4173
        }
4174
#endif /* defined(__APPLE__) && defined(__MACH__) */
4175
        return ret;
4176
    }
4177

4178
    /* Since this does ioctl the device must be already opened */
4179
    bs->sg = hdev_is_sg(bs);
4180

4181
    return ret;
4182
}
4183

4184
#if defined(__linux__)
4185
static int coroutine_fn
4186
hdev_co_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
4187
{
4188
    BDRVRawState *s = bs->opaque;
4189
    RawPosixAIOData acb;
4190
    int ret;
4191

4192
    ret = fd_open(bs);
4193
    if (ret < 0) {
4194
        return ret;
4195
    }
4196

4197
    if (req == SG_IO && s->pr_mgr) {
4198
        struct sg_io_hdr *io_hdr = buf;
4199
        if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
4200
            io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
4201
            return pr_manager_execute(s->pr_mgr, qemu_get_current_aio_context(),
4202
                                      s->fd, io_hdr);
4203
        }
4204
    }
4205

4206
    acb = (RawPosixAIOData) {
4207
        .bs         = bs,
4208
        .aio_type   = QEMU_AIO_IOCTL,
4209
        .aio_fildes = s->fd,
4210
        .aio_offset = 0,
4211
        .ioctl      = {
4212
            .buf        = buf,
4213
            .cmd        = req,
4214
        },
4215
    };
4216

4217
    return raw_thread_pool_submit(handle_aiocb_ioctl, &acb);
4218
}
4219
#endif /* linux */
4220

4221
static coroutine_fn int
4222
hdev_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
4223
{
4224
    BDRVRawState *s = bs->opaque;
4225
    int ret;
4226

4227
    ret = fd_open(bs);
4228
    if (ret < 0) {
4229
        raw_account_discard(s, bytes, ret);
4230
        return ret;
4231
    }
4232
    return raw_do_pdiscard(bs, offset, bytes, true);
4233
}
4234

4235
/*
 * .bdrv_co_pwrite_zeroes for "host_device".
 *
 * Returns the negative fd_open() result if the descriptor is not usable;
 * otherwise forwards to raw_do_pwrite_zeroes() with its last argument set
 * to true.
 */
static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
{
    int rc = fd_open(bs);

    if (rc >= 0) {
        rc = raw_do_pwrite_zeroes(bs, offset, bytes, flags, true);
    }
    return rc;
}
4247

4248
static BlockDriver bdrv_host_device = {
4249
    .format_name        = "host_device",
4250
    .protocol_name        = "host_device",
4251
    .instance_size      = sizeof(BDRVRawState),
4252
    .bdrv_needs_filename = true,
4253
    .bdrv_probe_device  = hdev_probe_device,
4254
    .bdrv_parse_filename = hdev_parse_filename,
4255
    .bdrv_open          = hdev_open,
4256
    .bdrv_close         = raw_close,
4257
    .bdrv_reopen_prepare = raw_reopen_prepare,
4258
    .bdrv_reopen_commit  = raw_reopen_commit,
4259
    .bdrv_reopen_abort   = raw_reopen_abort,
4260
    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
4261
    .create_opts         = &bdrv_create_opts_simple,
4262
    .mutable_opts        = mutable_opts,
4263
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
4264
    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
4265

4266
    .bdrv_co_preadv         = raw_co_preadv,
4267
    .bdrv_co_pwritev        = raw_co_pwritev,
4268
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4269
    .bdrv_co_pdiscard       = hdev_co_pdiscard,
4270
    .bdrv_co_copy_range_from = raw_co_copy_range_from,
4271
    .bdrv_co_copy_range_to  = raw_co_copy_range_to,
4272
    .bdrv_refresh_limits = raw_refresh_limits,
4273

4274
    .bdrv_co_truncate                   = raw_co_truncate,
4275
    .bdrv_co_getlength                  = raw_co_getlength,
4276
    .bdrv_co_get_info                   = raw_co_get_info,
4277
    .bdrv_get_specific_info             = raw_get_specific_info,
4278
    .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4279
    .bdrv_get_specific_stats = hdev_get_specific_stats,
4280
    .bdrv_check_perm = raw_check_perm,
4281
    .bdrv_set_perm   = raw_set_perm,
4282
    .bdrv_abort_perm_update = raw_abort_perm_update,
4283
    .bdrv_probe_blocksizes = hdev_probe_blocksizes,
4284
    .bdrv_probe_geometry = hdev_probe_geometry,
4285

4286
    /* generic scsi device */
4287
#ifdef __linux__
4288
    .bdrv_co_ioctl          = hdev_co_ioctl,
4289
#endif
4290

4291
    /* zoned device */
4292
#if defined(CONFIG_BLKZONED)
4293
    /* zone management operations */
4294
    .bdrv_co_zone_report = raw_co_zone_report,
4295
    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
4296
    .bdrv_co_zone_append = raw_co_zone_append,
4297
#endif
4298
};
4299

4300
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
4301
static void cdrom_parse_filename(const char *filename, QDict *options,
4302
                                 Error **errp)
4303
{
4304
    bdrv_parse_filename_strip_prefix(filename, "host_cdrom:", options);
4305
}
4306

4307
/*
 * .bdrv_refresh_limits for "host_cdrom": flags the node as having a
 * variable length, then defers to raw_refresh_limits() for the rest.
 */
static void cdrom_refresh_limits(BlockDriverState *bs, Error **errp)
{
    /* Set before the generic refresh runs, as in the original ordering */
    bs->bl.has_variable_length = true;

    raw_refresh_limits(bs, errp);
}
4312
#endif
4313

4314
#ifdef __linux__
4315
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
4316
                      Error **errp)
4317
{
4318
    BDRVRawState *s = bs->opaque;
4319

4320
    s->type = FTYPE_CD;
4321

4322
    /* open will not fail even if no CD is inserted, so add O_NONBLOCK */
4323
    return raw_open_common(bs, options, flags, O_NONBLOCK, true, errp);
4324
}
4325

4326
static int cdrom_probe_device(const char *filename)
4327
{
4328
    int fd, ret;
4329
    int prio = 0;
4330
    struct stat st;
4331

4332
    fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
4333
    if (fd < 0) {
4334
        goto out;
4335
    }
4336
    ret = fstat(fd, &st);
4337
    if (ret == -1 || !S_ISBLK(st.st_mode)) {
4338
        goto outc;
4339
    }
4340

4341
    /* Attempt to detect via a CDROM specific ioctl */
4342
    ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
4343
    if (ret >= 0)
4344
        prio = 100;
4345

4346
outc:
4347
    qemu_close(fd);
4348
out:
4349
    return prio;
4350
}
4351

4352
/*
 * .bdrv_co_is_inserted for "host_cdrom" (Linux): true only when the
 * CDROM_DRIVE_STATUS ioctl on s->fd returns CDS_DISC_OK.
 */
static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;

    return ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT) == CDS_DISC_OK;
}
4360

4361
/*
 * .bdrv_co_eject for "host_cdrom" (Linux).
 *
 * Issues CDROMEJECT when @eject_flag is true and CDROMCLOSETRAY otherwise.
 * A failing ioctl is only reported with perror(), naming the request that
 * actually failed.
 */
static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *s = bs->opaque;

    if (eject_flag) {
        if (ioctl(s->fd, CDROMEJECT, NULL) < 0) {
            perror("CDROMEJECT");
        }
    } else {
        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) {
            /* Previously mislabelled as "CDROMEJECT" */
            perror("CDROMCLOSETRAY");
        }
    }
}
4373

4374
/*
 * .bdrv_co_lock_medium for "host_cdrom" (Linux): issues CDROM_LOCKDOOR with
 * @locked as its argument.
 */
static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *s = bs->opaque;

    /*
     * The result is deliberately ignored.
     * Note: an error can happen if the distribution automatically
     * mounts the CD-ROM
     */
    (void)ioctl(s->fd, CDROM_LOCKDOOR, locked);
}
4386

4387
static BlockDriver bdrv_host_cdrom = {
4388
    .format_name        = "host_cdrom",
4389
    .protocol_name      = "host_cdrom",
4390
    .instance_size      = sizeof(BDRVRawState),
4391
    .bdrv_needs_filename = true,
4392
    .bdrv_probe_device	= cdrom_probe_device,
4393
    .bdrv_parse_filename = cdrom_parse_filename,
4394
    .bdrv_open          = cdrom_open,
4395
    .bdrv_close         = raw_close,
4396
    .bdrv_reopen_prepare = raw_reopen_prepare,
4397
    .bdrv_reopen_commit  = raw_reopen_commit,
4398
    .bdrv_reopen_abort   = raw_reopen_abort,
4399
    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
4400
    .create_opts         = &bdrv_create_opts_simple,
4401
    .mutable_opts        = mutable_opts,
4402
    .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
4403

4404
    .bdrv_co_preadv         = raw_co_preadv,
4405
    .bdrv_co_pwritev        = raw_co_pwritev,
4406
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4407
    .bdrv_refresh_limits    = cdrom_refresh_limits,
4408

4409
    .bdrv_co_truncate                   = raw_co_truncate,
4410
    .bdrv_co_getlength                  = raw_co_getlength,
4411
    .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4412

4413
    /* removable device support */
4414
    .bdrv_co_is_inserted    = cdrom_co_is_inserted,
4415
    .bdrv_co_eject          = cdrom_co_eject,
4416
    .bdrv_co_lock_medium    = cdrom_co_lock_medium,
4417

4418
    /* generic scsi device */
4419
    .bdrv_co_ioctl      = hdev_co_ioctl,
4420
};
4421
#endif /* __linux__ */
4422

4423
#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
4424
static int cdrom_open(BlockDriverState *bs, QDict *options, int flags,
4425
                      Error **errp)
4426
{
4427
    BDRVRawState *s = bs->opaque;
4428
    int ret;
4429

4430
    s->type = FTYPE_CD;
4431

4432
    ret = raw_open_common(bs, options, flags, 0, true, errp);
4433
    if (ret) {
4434
        return ret;
4435
    }
4436

4437
    /* make sure the door isn't locked at this time */
4438
    ioctl(s->fd, CDIOCALLOW);
4439
    return 0;
4440
}
4441

4442
/*
 * .bdrv_probe_device for "host_cdrom" (FreeBSD): returns 100 for names
 * starting with "/dev/cd" or "/dev/acd", and 0 otherwise.
 */
static int cdrom_probe_device(const char *filename)
{
    if (strstart(filename, "/dev/cd", NULL)) {
        return 100;
    }
    if (strstart(filename, "/dev/acd", NULL)) {
        return 100;
    }
    return 0;
}
4449

4450
/*
 * Close and reopen bs->filename with the flags saved in s->open_flags.
 *
 * Returns 0 on success.  If the reopen fails, s->fd is set to -1 and -EIO
 * is returned.
 */
static int cdrom_reopen(BlockDriverState *bs)
{
    BDRVRawState *s = bs->opaque;
    int new_fd;

    /*
     * Force reread of possibly changed/newly loaded disc,
     * FreeBSD seems to not notice sometimes...
     */
    if (s->fd >= 0) {
        qemu_close(s->fd);
    }

    new_fd = qemu_open(bs->filename, s->open_flags, NULL);
    s->fd = (new_fd < 0) ? -1 : new_fd;
    if (new_fd < 0) {
        return -EIO;
    }

    /* make sure the door isn't locked at this time */
    ioctl(s->fd, CDIOCALLOW);
    return 0;
}
4472

4473
/*
 * .bdrv_co_is_inserted for "host_cdrom" (FreeBSD): a medium counts as
 * inserted when raw_getlength() reports a length greater than zero.
 */
static bool coroutine_fn cdrom_co_is_inserted(BlockDriverState *bs)
{
    if (raw_getlength(bs) > 0) {
        return true;
    }
    return false;
}
4477

4478
/*
 * .bdrv_co_eject for "host_cdrom" (FreeBSD).
 *
 * Does nothing when no descriptor is open.  Otherwise issues CDIOCALLOW,
 * then CDIOCEJECT (@eject_flag true) or CDIOCCLOSE (@eject_flag false), and
 * finally reopens the device via cdrom_reopen().  A failing eject/close
 * ioctl is only reported with perror().
 */
static void coroutine_fn cdrom_co_eject(BlockDriverState *bs, bool eject_flag)
{
    BDRVRawState *s = bs->opaque;

    if (s->fd >= 0) {
        (void) ioctl(s->fd, CDIOCALLOW);

        if (!eject_flag) {
            if (ioctl(s->fd, CDIOCCLOSE) < 0) {
                perror("CDIOCCLOSE");
            }
        } else {
            if (ioctl(s->fd, CDIOCEJECT) < 0) {
                perror("CDIOCEJECT");
            }
        }

        cdrom_reopen(bs);
    }
}
4497

4498
/*
 * .bdrv_co_lock_medium for "host_cdrom" (FreeBSD): issues CDIOCPREVENT when
 * @locked is true and CDIOCALLOW otherwise.  Does nothing when no
 * descriptor is open.
 */
static void coroutine_fn cdrom_co_lock_medium(BlockDriverState *bs, bool locked)
{
    BDRVRawState *s = bs->opaque;

    if (s->fd >= 0) {
        /*
         * The result is deliberately ignored.
         * Note: an error can happen if the distribution automatically
         * mounts the CD-ROM
         */
        (void) ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW));
    }
}
4512

4513
static BlockDriver bdrv_host_cdrom = {
4514
    .format_name        = "host_cdrom",
4515
    .protocol_name      = "host_cdrom",
4516
    .instance_size      = sizeof(BDRVRawState),
4517
    .bdrv_needs_filename = true,
4518
    .bdrv_probe_device	= cdrom_probe_device,
4519
    .bdrv_parse_filename = cdrom_parse_filename,
4520
    .bdrv_open          = cdrom_open,
4521
    .bdrv_close         = raw_close,
4522
    .bdrv_reopen_prepare = raw_reopen_prepare,
4523
    .bdrv_reopen_commit  = raw_reopen_commit,
4524
    .bdrv_reopen_abort   = raw_reopen_abort,
4525
    .bdrv_co_create_opts = bdrv_co_create_opts_simple,
4526
    .create_opts         = &bdrv_create_opts_simple,
4527
    .mutable_opts       = mutable_opts,
4528

4529
    .bdrv_co_preadv         = raw_co_preadv,
4530
    .bdrv_co_pwritev        = raw_co_pwritev,
4531
    .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
4532
    .bdrv_refresh_limits    = cdrom_refresh_limits,
4533

4534
    .bdrv_co_truncate                   = raw_co_truncate,
4535
    .bdrv_co_getlength                  = raw_co_getlength,
4536
    .bdrv_co_get_allocated_file_size    = raw_co_get_allocated_file_size,
4537

4538
    /* removable device support */
4539
    .bdrv_co_is_inserted     = cdrom_co_is_inserted,
4540
    .bdrv_co_eject           = cdrom_co_eject,
4541
    .bdrv_co_lock_medium     = cdrom_co_lock_medium,
4542
};
4543
#endif /* __FreeBSD__ */
4544

4545
#endif /* HAVE_HOST_BLOCK_DEVICE */
4546

4547
static void bdrv_file_init(void)
4548
{
4549
    /*
4550
     * Register all the drivers.  Note that order is important, the driver
4551
     * registered last will get probed first.
4552
     */
4553
    bdrv_register(&bdrv_file);
4554
#if defined(HAVE_HOST_BLOCK_DEVICE)
4555
    bdrv_register(&bdrv_host_device);
4556
#ifdef __linux__
4557
    bdrv_register(&bdrv_host_cdrom);
4558
#endif
4559
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
4560
    bdrv_register(&bdrv_host_cdrom);
4561
#endif
4562
#endif /* HAVE_HOST_BLOCK_DEVICE */
4563
}
4564

4565
block_init(bdrv_file_init);
4566
qemu

Использование cookies