2
* DMA memory preregistration
5
* Alexey Kardashevskiy <aik@ozlabs.ru>
7
* This work is licensed under the terms of the GNU GPL, version 2. See
8
* the COPYING file in the top-level directory.
11
#include "qemu/osdep.h"
13
#include <linux/vfio.h>
17
#include "sysemu/kvm.h"
18
#include "exec/address-spaces.h"
20
#include "hw/vfio/vfio-common.h"
22
#include "exec/ram_addr.h"
23
#include "qemu/error-report.h"
24
#include "qapi/error.h"
27
/*
 * sPAPR flavour of a VFIO container: embeds the generic VFIOContainer and
 * adds the state used by the functions in this file.
 */
typedef struct VFIOSpaprContainer {
28
VFIOContainer container;
29
/* Memory listener whose callbacks register/unregister RAM sections */
MemoryListener prereg_listener;
30
/* Host DMA windows known to this container, chained via hostwin_next */
QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
33
OBJECT_DECLARE_SIMPLE_TYPE(VFIOSpaprContainer, VFIO_IOMMU_SPAPR);
35
/*
 * Tell whether the preregistration listener should ignore @section.
 *
 * Calls hw_error() when the memory region of the section is an IOMMU
 * region. Otherwise returns true when the region is not RAM, or when it is
 * a RAM device region; returns false for any other RAM region.
 */
static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
37
if (memory_region_is_iommu(section->mr)) {
38
hw_error("Cannot possibly preregister IOMMU memory");
41
return !memory_region_is_ram(section->mr) ||
42
memory_region_is_ram_device(section->mr);
45
/*
 * Convert guest physical address @gpa into a host virtual address using
 * @section: take the RAM pointer of the memory region, add the offset of
 * the section within that region, then add the distance of @gpa from the
 * start of the section in the address space.
 *
 * No range check on @gpa is visible here; callers in this file pass the
 * start address of the section.
 */
static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
47
return memory_region_get_ram_ptr(section->mr) +
48
section->offset_within_region +
49
(gpa - section->offset_within_address_space);
52
/*
 * region_add callback of the preregistration listener.
 *
 * Registers the host memory backing @section with the container through
 * the VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl:
 *  - sections rejected by vfio_prereg_listener_skipped_section() are only
 *    traced;
 *  - sections whose address, offset within the region or size are not
 *    aligned to the real host page size are reported as an error;
 *  - otherwise a reference on the memory region is taken and the host
 *    virtual address of the section start is handed to the kernel.
 *
 * On ioctl failure, the first error is stored in bcontainer->error while
 * the container is not yet initialized; the hw_error() call covers the
 * runtime case described by the original comment below.
 *
 * NOTE(review): several lines of this function (braces, declarations of
 * ret/end, the reg initializer fields, early returns) are not visible in
 * this listing.
 */
static void vfio_prereg_listener_region_add(MemoryListener *listener,
53
MemoryRegionSection *section)
55
VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
57
VFIOContainer *container = &scontainer->container;
58
VFIOContainerBase *bcontainer = &container->bcontainer;
59
const hwaddr gpa = section->offset_within_address_space;
62
hwaddr page_mask = qemu_real_host_page_mask();
63
struct vfio_iommu_spapr_register_memory reg = {
68
if (vfio_prereg_listener_skipped_section(section)) {
69
/* Trace the inclusive address range [start, start + size - 1] */
trace_vfio_prereg_listener_region_add_skip(
70
section->offset_within_address_space,
71
section->offset_within_address_space +
72
int128_get64(int128_sub(section->size, int128_one())));
76
/* Any bit set below the host page boundary means the value is unaligned */
if (unlikely((section->offset_within_address_space & ~page_mask) ||
77
(section->offset_within_region & ~page_mask) ||
78
(int128_get64(section->size) & ~page_mask))) {
79
error_report("%s received unaligned region", __func__);
83
end = section->offset_within_address_space + int128_get64(section->size);
88
memory_region_ref(section->mr);
90
reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
93
/*
 * NOTE(review): the last ioctl argument below looks mis-encoded; it is
 * presumably the address of the reg structure declared above - confirm.
 * NOTE(review): the trace call reports -errno, while the error path
 * further down passes -ret to error_setg_errno(); ret is the raw ioctl()
 * return value, so confirm that -ret carries the intended errno.
 */
ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®);
94
trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
97
* On the initfn path, store the first error in the container so we
98
* can gracefully fail. Runtime, there's not much we can do other
99
* than throw a hardware error.
101
if (!bcontainer->initialized) {
102
if (!bcontainer->error) {
103
error_setg_errno(&bcontainer->error, -ret,
104
"Memory registering failed");
107
hw_error("vfio: Memory registering failed, unable to continue");
112
/*
 * region_del callback of the preregistration listener.
 *
 * Mirror of the region_add callback: sections rejected by
 * vfio_prereg_listener_skipped_section() are only traced, unaligned
 * sections are reported as an error, and otherwise the host virtual
 * address and size of the section are passed to the
 * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctl. The result of the ioctl is
 * traced; no further error handling is visible in this listing.
 *
 * NOTE(review): some lines of this function (braces, declarations of
 * ret/end, early returns) are not visible in this listing.
 */
static void vfio_prereg_listener_region_del(MemoryListener *listener,
113
MemoryRegionSection *section)
115
VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
117
VFIOContainer *container = &scontainer->container;
118
const hwaddr gpa = section->offset_within_address_space;
121
hwaddr page_mask = qemu_real_host_page_mask();
122
struct vfio_iommu_spapr_register_memory reg = {
123
.argsz = sizeof(reg),
127
if (vfio_prereg_listener_skipped_section(section)) {
128
/* Trace the inclusive address range [start, start + size - 1] */
trace_vfio_prereg_listener_region_del_skip(
129
section->offset_within_address_space,
130
section->offset_within_address_space +
131
int128_get64(int128_sub(section->size, int128_one())));
135
/* Any bit set below the host page boundary means the value is unaligned */
if (unlikely((section->offset_within_address_space & ~page_mask) ||
136
(section->offset_within_region & ~page_mask) ||
137
(int128_get64(section->size) & ~page_mask))) {
138
error_report("%s received unaligned region", __func__);
142
end = section->offset_within_address_space + int128_get64(section->size);
147
reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
148
/* end is exclusive, so this is the section size in bytes */
reg.size = end - gpa;
150
/*
 * NOTE(review): the last ioctl argument below looks mis-encoded; it is
 * presumably the address of the reg structure declared above - confirm.
 */
ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®);
151
trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
154
/*
 * Template for the preregistration memory listener. It is copied into
 * VFIOSpaprContainer.prereg_listener by vfio_spapr_container_setup()
 * before being registered.
 */
static const MemoryListener vfio_prereg_listener = {
155
.name = "vfio-pre-reg",
156
.region_add = vfio_prereg_listener_region_add,
157
.region_del = vfio_prereg_listener_region_del,
160
/*
 * Record a host DMA window in the window list of @scontainer.
 *
 * @min_iova and @max_iova are both inclusive (the length is computed as
 * max - min + 1). @iova_pgsizes is stored as is in the new entry.
 *
 * Calls hw_error() if the new range overlaps a window already in the
 * list. The new entry is zero-allocated with g_malloc0() and inserted at
 * the head of the list.
 */
static void vfio_host_win_add(VFIOSpaprContainer *scontainer, hwaddr min_iova,
161
hwaddr max_iova, uint64_t iova_pgsizes)
163
VFIOHostDMAWindow *hostwin;
165
QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
166
if (ranges_overlap(hostwin->min_iova,
167
hostwin->max_iova - hostwin->min_iova + 1,
169
max_iova - min_iova + 1)) {
170
hw_error("%s: Overlapped IOMMU are not enabled", __func__);
174
hostwin = g_malloc0(sizeof(*hostwin));
176
hostwin->min_iova = min_iova;
177
hostwin->max_iova = max_iova;
178
hostwin->iova_pgsizes = iova_pgsizes;
179
QLIST_INSERT_HEAD(&scontainer->hostwin_list, hostwin, hostwin_next);
182
/*
 * Remove from the window list of @scontainer the host DMA window whose
 * bounds are exactly [@min_iova, @max_iova].
 *
 * NOTE(review): the statements after QLIST_REMOVE (freeing the entry and
 * the return values) are not visible in this listing. The caller in
 * vfio_spapr_container_del_section_window() treats a negative return as
 * "no such window", so presumably a negative value is returned when
 * nothing matches - confirm against the full source.
 */
static int vfio_host_win_del(VFIOSpaprContainer *scontainer,
183
hwaddr min_iova, hwaddr max_iova)
185
VFIOHostDMAWindow *hostwin;
187
QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
188
if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
189
QLIST_REMOVE(hostwin, hostwin_next);
198
/*
 * Look for a host DMA window of @container that fully contains the
 * inclusive range [@iova, @end].
 *
 * Returns the matching window, or NULL when none was found.
 *
 * NOTE(review): returning hostwin relies on the loop being left at the
 * matching entry; the statement that exits the loop is not visible in
 * this listing.
 */
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOSpaprContainer *container,
199
hwaddr iova, hwaddr end)
201
VFIOHostDMAWindow *hostwin;
202
bool hostwin_found = false;
204
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
205
if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
206
hostwin_found = true;
211
return hostwin_found ? hostwin : NULL;
214
/*
 * Ask the kernel, through the VFIO_IOMMU_SPAPR_TCE_REMOVE ioctl on the
 * container fd, to remove the DMA window starting at
 * @offset_within_address_space.
 *
 * An error is reported when the ioctl fails; the removal is traced
 * otherwise.
 *
 * NOTE(review): the return statements are not visible in this listing.
 * vfio_spapr_container_setup() passes the negated result to
 * error_setg_errno(), so presumably a negative errno value is returned on
 * failure - confirm against the full source.
 */
static int vfio_spapr_remove_window(VFIOContainer *container,
215
hwaddr offset_within_address_space)
217
struct vfio_iommu_spapr_tce_remove remove = {
218
.argsz = sizeof(remove),
219
.start_addr = offset_within_address_space,
223
ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
225
error_report("Failed to remove window at %"PRIx64,
226
(uint64_t)remove.start_addr);
230
trace_vfio_spapr_remove_window(offset_within_address_space);
235
/*
 * Create a host DMA window covering @section through the
 * VFIO_IOMMU_SPAPR_TCE_CREATE ioctl.
 *
 * Steps visible below:
 *  1. Pick the IOMMU page size: start from the minimum page size of the
 *     guest IOMMU region, cap it at qemu_minrampagesize(), then choose the
 *     largest size advertised in bcontainer->pgsizes that does not exceed
 *     it. An error is reported if the host supports no such size.
 *  2. Derive the window size and page shift, and estimate the number of
 *     TCE table levels from the number of index bits and the amount of
 *     index bits assumed to fit in one level.
 *  3. Issue the ioctl, retrying with more levels up to max_levels.
 *  4. If the kernel placed the window at an address different from the
 *     start of the section, remove it again and report an error.
 *
 * NOTE(review): the third parameter of this function, the declaration of
 * ret and all return statements are not visible in this listing; the
 * caller passes the address of a page size variable as third argument.
 */
static int vfio_spapr_create_window(VFIOContainer *container,
236
MemoryRegionSection *section,
240
VFIOContainerBase *bcontainer = &container->bcontainer;
241
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
242
uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
243
unsigned entries, bits_total, bits_per_level, max_levels;
244
struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
245
long rampagesize = qemu_minrampagesize();
248
* The host might not support the guest supported IOMMU page size,
249
* so we will use smaller physical IOMMU pages to back them.
251
if (pagesize > rampagesize) {
252
pagesize = rampagesize;
254
/*
 * For a power-of-two pagesize, pagesize | (pagesize - 1) selects every
 * bit up to and including the pagesize bit, so pgmask keeps only the
 * host-supported sizes that are not larger than pagesize.
 */
pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
255
/* Highest set bit of pgmask, i.e. the largest usable size; 0 if none */
pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
257
/*
 * NOTE(review): the mask is printed with %lx; if bcontainer->pgsizes is
 * a fixed-width 64-bit type, PRIx64 would be the portable specifier -
 * confirm against the declaration of pgsizes.
 */
error_report("Host doesn't support page size 0x%"PRIx64
258
", the supported mask is 0x%lx",
259
memory_region_iommu_get_min_page_size(iommu_mr),
260
bcontainer->pgsizes);
265
* FIXME: For VFIO iommu types which have KVM acceleration to
266
* avoid bouncing all map/unmaps through qemu this way, this
267
* would be the right place to wire that up (tell the KVM
268
* device emulation the VFIO iommu handles to use).
270
create.window_size = int128_get64(section->size);
271
create.page_shift = ctz64(pagesize);
273
* SPAPR host supports multilevel TCE tables. We try to guess optimal
274
* levels number and if this fails (for example due to the host memory
275
* fragmentation), we increase levels. The DMA address structure is:
276
* rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
278
* r = reserved (bits >= 55 are reserved in the existing hardware)
279
* i = IOMMU page offset (64K in this example)
280
* x = bits to index a TCE which can be split to equal chunks to index
282
* The aim is to split "x" to smaller possible number of levels.
284
/*
 * NOTE(review): entries is declared as unsigned; presumably the window
 * size divided by the page size always fits - confirm for large windows.
 */
entries = create.window_size >> create.page_shift;
285
/* bits_total is number of "x" needed */
286
bits_total = ctz64(entries * sizeof(uint64_t));
288
* bits_per_level is a safe guess of how much we can allocate per level:
289
* 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
290
* is usually bigger than that.
291
* Below we look at qemu_real_host_page_size as TCEs are allocated from
294
bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
295
create.levels = bits_total / bits_per_level;
296
/* A remainder means the integer division above came up one level short */
if (bits_total % bits_per_level) {
299
max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
300
/* Retry the creation with one more level each time, up to max_levels */
for ( ; create.levels <= max_levels; ++create.levels) {
301
ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
307
error_report("Failed to create a window, ret = %d (%m)", ret);
311
/* The kernel chooses the window address; it must match the section */
if (create.start_addr != section->offset_within_address_space) {
312
vfio_spapr_remove_window(container, create.start_addr);
314
error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
315
section->offset_within_address_space,
316
(uint64_t)create.start_addr);
319
trace_vfio_spapr_create_window(create.page_shift,
329
/*
 * add_window callback of the sPAPR IOMMU class (see the class init
 * function of this file).
 *
 * For a VFIO_SPAPR_TCE_IOMMU container, only checks that @section fits in
 * a host window already recorded and sets an error otherwise. For a
 * VFIO_SPAPR_TCE_v2_IOMMU container, refuses sections overlapping an
 * existing host window, creates a new window with
 * vfio_spapr_create_window(), records it with vfio_host_win_add(), and
 * then issues KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE on the KVM VFIO device for
 * every group of the container.
 *
 * NOTE(review): the return type, the third parameter (an error pointer
 * named errp is used below), several declarations, the return statements
 * and the conditions guarding the KVM part are not visible in this
 * listing.
 */
vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
330
MemoryRegionSection *section,
333
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
335
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
337
VFIOHostDMAWindow *hostwin;
342
* VFIO_SPAPR_TCE_IOMMU supports a single host window between
343
* [dma32_window_start, dma32_window_size), we need to ensure
344
* the section fall in this range.
346
if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
349
iova = section->offset_within_address_space;
350
/* end is the last byte of the section (inclusive) */
end = iova + int128_get64(section->size) - 1;
352
if (!vfio_find_hostwin(scontainer, iova, end)) {
353
error_setg(errp, "Container %p can't map guest IOVA region"
354
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
361
if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
365
/* For now intersections are not allowed, we may relax this later */
366
QLIST_FOREACH(hostwin, &scontainer->hostwin_list, hostwin_next) {
367
if (ranges_overlap(hostwin->min_iova,
368
hostwin->max_iova - hostwin->min_iova + 1,
369
section->offset_within_address_space,
370
int128_get64(section->size))) {
372
/*
 * NOTE(review): the two adjacent string literals below are joined
 * without a separating space, so the message reads existinghost.
 */
"region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
373
"host DMA window [0x%"PRIx64",0x%"PRIx64"]",
374
section->offset_within_address_space,
375
section->offset_within_address_space +
376
int128_get64(section->size) - 1,
377
hostwin->min_iova, hostwin->max_iova);
382
ret = vfio_spapr_create_window(container, section, &pgsize);
384
error_setg_errno(errp, -ret, "Failed to create SPAPR window");
388
/* Remember the new window, using the inclusive end address */
vfio_host_win_add(scontainer, section->offset_within_address_space,
389
section->offset_within_address_space +
390
int128_get64(section->size) - 1, pgsize);
394
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
395
struct kvm_vfio_spapr_tce param;
396
struct kvm_device_attr attr = {
397
.group = KVM_DEV_VFIO_GROUP,
398
.attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
399
/*
 * NOTE(review): the operand of the cast below looks mis-encoded; it is
 * presumably the address of the param structure declared above - confirm.
 */
.addr = (uint64_t)(unsigned long)¶m,
402
if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
404
/* Attach every VFIO group of this container to the TCE table fd */
QLIST_FOREACH(group, &container->group_list, container_next) {
405
param.groupfd = group->fd;
406
if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
407
error_setg_errno(errp, errno,
408
"vfio: failed GROUP_SET_SPAPR_TCE for "
409
"KVM VFIO device %d and group fd %d",
410
param.tablefd, param.groupfd);
413
trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
422
/*
 * del_window callback of the sPAPR IOMMU class (see the class init
 * function of this file).
 *
 * Removes the kernel DMA window that starts at the address of @section,
 * then drops the matching entry from the host window list; hw_error() is
 * raised when no such entry exists.
 *
 * NOTE(review): the return type and the body of the iommu_type check are
 * not visible in this listing; presumably containers that are not
 * VFIO_SPAPR_TCE_v2_IOMMU return early - confirm against the full source.
 */
vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
423
MemoryRegionSection *section)
425
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
427
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
430
if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
434
/* The result of the kernel-side removal is not checked here */
vfio_spapr_remove_window(container,
435
section->offset_within_address_space);
436
if (vfio_host_win_del(scontainer,
437
section->offset_within_address_space,
438
section->offset_within_address_space +
439
int128_get64(section->size) - 1) < 0) {
440
hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
441
__func__, section->offset_within_address_space);
445
/*
 * release callback of the sPAPR IOMMU class.
 *
 * Unregisters the preregistration listener for VFIO_SPAPR_TCE_v2_IOMMU
 * containers, then unlinks every entry of the host window list.
 *
 * NOTE(review): the statement following QLIST_REMOVE is not visible in
 * this listing; presumably the entry is freed there - confirm.
 */
static void vfio_spapr_container_release(VFIOContainerBase *bcontainer)
447
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
449
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
451
VFIOHostDMAWindow *hostwin, *next;
453
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
454
memory_listener_unregister(&scontainer->prereg_listener);
456
/* The _SAFE variant is needed because entries are removed while iterating */
QLIST_FOREACH_SAFE(hostwin, &scontainer->hostwin_list, hostwin_next,
458
QLIST_REMOVE(hostwin, hostwin_next);
463
/*
 * setup callback of the sPAPR IOMMU class.
 *
 * Operations visible below, in order:
 *  - initialize the host window list;
 *  - enable the container with the VFIO_IOMMU_ENABLE ioctl;
 *  - copy the vfio_prereg_listener template into the container and
 *    register it on address_space_memory, failing if a registration
 *    callback stored an error in bcontainer->error;
 *  - query the container with VFIO_IOMMU_SPAPR_TCE_GET_INFO;
 *  - either take the supported page sizes from info.ddw.pgsizes and
 *    remove the default window, or use a fixed 4K page size mask and
 *    record the default 32-bit window as a host window.
 * Failures are reported through errp; the listener_unregister_exit label
 * unregisters the preregistration listener.
 *
 * NOTE(review): the second parameter, the conditionals selecting between
 * the v1 and v2 paths (the v2 flag is computed below) and all return
 * statements are not visible in this listing.
 */
static bool vfio_spapr_container_setup(VFIOContainerBase *bcontainer,
466
VFIOContainer *container = container_of(bcontainer, VFIOContainer,
468
VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
470
struct vfio_iommu_spapr_tce_info info;
471
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
472
int ret, fd = container->fd;
474
QLIST_INIT(&scontainer->hostwin_list);
477
* The host kernel code implementing VFIO_IOMMU_DISABLE is called
478
* when container fd is closed so we do not call it explicitly
482
ret = ioctl(fd, VFIO_IOMMU_ENABLE);
484
error_setg_errno(errp, errno, "failed to enable container");
488
/* Each container gets its own copy of the listener template */
scontainer->prereg_listener = vfio_prereg_listener;
490
memory_listener_register(&scontainer->prereg_listener,
491
&address_space_memory);
492
/* The region_add callback stores its first failure in bcontainer->error */
if (bcontainer->error) {
493
error_propagate_prepend(errp, bcontainer->error,
494
"RAM memory listener initialization failed: ");
495
goto listener_unregister_exit;
499
info.argsz = sizeof(info);
500
ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
502
error_setg_errno(errp, errno,
503
"VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
504
goto listener_unregister_exit;
508
bcontainer->pgsizes = info.ddw.pgsizes;
510
* There is a default window in just created container.
511
* To make region_add/del simpler, we better remove this
512
* window now and let those iommu_listener callbacks
513
* create/remove them when needed.
515
ret = vfio_spapr_remove_window(container, info.dma32_window_start);
517
error_setg_errno(errp, -ret,
518
"failed to remove existing window");
519
goto listener_unregister_exit;
522
/* The default table uses 4K pages */
523
bcontainer->pgsizes = 0x1000;
524
/* Record the default window with an inclusive end address */
vfio_host_win_add(scontainer, info.dma32_window_start,
525
info.dma32_window_start +
526
info.dma32_window_size - 1,
532
listener_unregister_exit:
534
memory_listener_unregister(&scontainer->prereg_listener);
539
/*
 * Class initializer for the sPAPR VFIO IOMMU type: installs the window
 * add/delete, release and setup callbacks defined in this file on the
 * VFIOIOMMUClass obtained from @klass. @data is not used in the lines
 * visible here.
 */
static void vfio_iommu_spapr_class_init(ObjectClass *klass, void *data)
541
VFIOIOMMUClass *vioc = VFIO_IOMMU_CLASS(klass);
543
vioc->add_window = vfio_spapr_container_add_section_window;
544
vioc->del_window = vfio_spapr_container_del_section_window;
545
vioc->release = vfio_spapr_container_release;
546
vioc->setup = vfio_spapr_container_setup;
549
/*
 * Type description for TYPE_VFIO_IOMMU_SPAPR: derives from
 * TYPE_VFIO_IOMMU_LEGACY, uses VFIOSpaprContainer as instance structure
 * and vfio_iommu_spapr_class_init() as class initializer.
 *
 * NOTE(review): the end of this array and its registration are not
 * visible in this listing.
 */
static const TypeInfo types[] = {
551
.name = TYPE_VFIO_IOMMU_SPAPR,
552
.parent = TYPE_VFIO_IOMMU_LEGACY,
553
.instance_size = sizeof(VFIOSpaprContainer),
554
.class_init = vfio_iommu_spapr_class_init,