/*
 * Support for RAM backed by mmaped host memory.
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_LINUX
#include <linux/mman.h>
#else  /* !CONFIG_LINUX */
#define MAP_SYNC              0x0
#define MAP_SHARED_VALIDATE   0x0
#endif /* CONFIG_LINUX */

#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"

#define HUGETLBFS_MAGIC       0x958458f6

#ifdef CONFIG_LINUX
#include <sys/vfs.h>
#include <linux/magic.h>
#endif

QemuFsType qemu_fd_getfs(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd < 0) {
        return QEMU_FS_TYPE_UNKNOWN;
    }

    do {
        ret = fstatfs(fd, &fs);
    } while (ret != 0 && errno == EINTR);

    switch (fs.f_type) {
    case TMPFS_MAGIC:
        return QEMU_FS_TYPE_TMPFS;
    case HUGETLBFS_MAGIC:
        return QEMU_FS_TYPE_HUGETLBFS;
    default:
        return QEMU_FS_TYPE_UNKNOWN;
    }
#else
    return QEMU_FS_TYPE_UNKNOWN;
#endif
}

size_t qemu_fd_getpagesize(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd != -1) {
        do {
            ret = fstatfs(fd, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size();
}

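/*
 * For instance, a hugetlbfs-backed fd whose filesystem reports a 2 MiB block
 * size makes qemu_fd_getpagesize() return 2 MiB, while fd == -1 or an fd on
 * an ordinary filesystem falls through to qemu_real_host_page_size()
 * (typically 4 KiB on x86).
 */
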
#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
{
#if defined(__linux__)
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    gchar *content = NULL;
    const char *endptr;
    unsigned int tmp;

    /*
     * hugetlb accounting is different from ordinary swap reservation:
     * a) Hugetlb pages from the pool are reserved for both private and
     *    shared mappings. For shared mappings, all mappers have to specify
     *    MAP_NORESERVE.
     * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
     */
    if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size()) {
        return true;
    }

    /*
     * Accountable mappings in the kernel that can be affected by MAP_NORESERVE
     * are private writable mappings (see mm/mmap.c:accountable_mapping() in
     * Linux). For all shared or readonly mappings, MAP_NORESERVE is always
     * implicitly active -- no reservation; this includes shmem. The only
     * exception is shared anonymous memory, which is accounted like private
     * anonymous memory.
     */
    if (readonly || (shared && fd >= 0)) {
        return true;
    }

    /*
     * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
     * memory overcommit is set to "never". Sparse memory regions aren't really
     * possible in this system configuration.
     *
     * Bail out now instead of silently committing way more memory than
     * currently desired by the user.
     */
    if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
        !qemu_strtoui(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        if (tmp == 2) {
            error_report("Skipping reservation of swap space is not supported:"
                         " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
            return false;
        }
        return true;
    }
    /* this interface has been around since Linux 2.6 */
    error_report("Skipping reservation of swap space is not supported:"
                 " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
    return false;
#endif
    /*
     * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
     * and removed it a while ago.
     */
    error_report("Skipping reservation of swap space is not supported");
    return false;
}

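/*
 * For example, a private, writable, anonymous mapping with
 * /proc/sys/vm/overcommit_memory at 0 or 1 lets MAP_NORESERVE take effect and
 * map_noreserve_effective() returns true; with overcommit_memory set to 2
 * ("never overcommit") it returns false, so the caller fails the mapping
 * instead of silently reserving swap. Hugetlb-backed, readonly and fd-based
 * shared mappings always return true.
 */
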
/*
 * Reserve a new memory region of the requested size to be used for mapping
 * from the given fd (if any).
 */
static void *mmap_reserve(size_t size, int fd)
{
    int flags = MAP_PRIVATE;

#if defined(__powerpc64__) && defined(__linux__)
    /*
     * On ppc64 mappings in the same segment (aka slice) must share the same
     * page size. Since we will be re-allocating part of this segment
     * from the supplied fd, we should make sure to use the same page size;
     * to this end we mmap the supplied fd. In this case, set MAP_NORESERVE to
     * avoid allocating backing store memory.
     * We do this unless we are using the system page size, in which case
     * anonymous memory is OK.
     */
    if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size()) {
        fd = -1;
        flags |= MAP_ANONYMOUS;
    } else {
        flags |= MAP_NORESERVE;
    }
#else
    fd = -1;
    flags |= MAP_ANONYMOUS;
#endif

    return mmap(0, size, PROT_NONE, flags, fd, 0);
}

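/*
 * The PROT_NONE reservation above only claims virtual address space; the
 * actual protection and backing (fd, shared/private, NORESERVE, MAP_SYNC)
 * are established by mmap_activate() below, which maps over part of the
 * reserved region with MAP_FIXED.
 */
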
/*
 * Activate memory in a reserved region from the given fd (if any), to make
 * it accessible.
 */
static void *mmap_activate(void *ptr, size_t size, int fd,
                           uint32_t qemu_map_flags, off_t map_offset)
{
    const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
    const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
    int map_sync_flags = 0;
    int flags = MAP_FIXED;
    void *activated_ptr;

    if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
        return MAP_FAILED;
    }

    flags |= fd == -1 ? MAP_ANONYMOUS : 0;
    flags |= shared ? MAP_SHARED : MAP_PRIVATE;
    flags |= noreserve ? MAP_NORESERVE : 0;
    if (shared && sync) {
        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
    }

    activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
                         map_offset);
    if (activated_ptr == MAP_FAILED && map_sync_flags) {
        if (errno == ENOTSUP) {
            char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
            char *file_name = g_malloc0(PATH_MAX);
            int len = readlink(proc_link, file_name, PATH_MAX - 1);

            if (len < 0) {
                len = 0;
            }
            file_name[len] = '\0';
            fprintf(stderr, "Warning: requesting persistence across crashes "
                    "for backend file %s failed. Proceeding without "
                    "persistence, data might become corrupted in case of host "
                    "crash.\n", file_name);
            g_free(proc_link);
            g_free(file_name);
            warn_report("Using non DAX backing file with 'pmem=on' option"
                        " is deprecated");
        }
        /*
         * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
         * again without these flags to handle backwards compatibility.
         */
        activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
    }
    return activated_ptr;
}

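/*
 * Note that MAP_SYNC is only attempted for shared mappings where the caller
 * also passed QEMU_MAP_SYNC (e.g. a DAX-capable backing file used with
 * pmem=on); if the kernel or filesystem rejects
 * MAP_SHARED_VALIDATE | MAP_SYNC with ENOTSUP, the mapping is retried without
 * those flags and a warning is printed.
 */
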
static inline size_t mmap_guard_pagesize(int fd)
{
#if defined(__powerpc64__) && defined(__linux__)
    /* Mappings in the same segment must share the same page size */
    return qemu_fd_getpagesize(fd);
#else
    return qemu_real_host_page_size();
#endif
}

void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    uint32_t qemu_map_flags,
                    off_t map_offset)
{
    const size_t guard_pagesize = mmap_guard_pagesize(fd);
    size_t offset, total;
    void *ptr, *guardptr;

    /*
     * Note: this always allocates at least one extra page of virtual address
     * space, even if size is already aligned.
     */
    total = size + align;

    guardptr = mmap_reserve(total, fd);
    if (guardptr == MAP_FAILED) {
        return MAP_FAILED;
    }

    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= guard_pagesize);

    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;

    ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
                        map_offset);
    if (ptr == MAP_FAILED) {
        munmap(guardptr, total);
        return MAP_FAILED;
    }

    if (offset > 0) {
        munmap(guardptr, offset);
    }

    /*
     * Leave a single PROT_NONE page allocated after the RAM block, to serve as
     * a guard page guarding against potential buffer overflows.
     */
    total -= offset;
    if (total > size + guard_pagesize) {
        munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
    }

    return ptr;
}

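/*
 * Hypothetical caller sketch (names and sizes chosen for illustration): with
 * size == 8 MiB, align == 2 MiB and a 4 KiB guard page, mmap_reserve() grabs
 * 10 MiB of address space. If the reservation happens to start 1 MiB below a
 * 2 MiB boundary, offset == 1 MiB: that leading 1 MiB is unmapped, the 8 MiB
 * RAM block is activated at the aligned address, one PROT_NONE guard page is
 * kept after it, and the rest of the tail is unmapped again.
 *
 *     void *block = qemu_ram_mmap(fd, 8 * MiB, 2 * MiB, QEMU_MAP_SHARED, 0);
 *     if (block != MAP_FAILED) {
 *         ...
 *         qemu_ram_munmap(fd, block, 8 * MiB);
 *     }
 */
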
void qemu_ram_munmap(int fd, void *ptr, size_t size)
{
    if (ptr) {
        /* Unmap both the RAM block and the guard page */
        munmap(ptr, size + mmap_guard_pagesize(fd));
    }
}