/*
 * mmap support for qemu
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include <sys/shm.h>
#include "trace.h"
#include "exec/log.h"
#include "exec/page-protection.h"
#include "qemu.h"
#include "user-internals.h"
#include "user-mmap.h"
#include "target_mman.h"
#include "qemu/interval-tree.h"

#ifdef TARGET_ARM
#include "target/arm/cpu-features.h"
#endif

static pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread int mmap_lock_count;

void mmap_lock(void)
{
    if (mmap_lock_count++ == 0) {
        pthread_mutex_lock(&mmap_mutex);
    }
}

void mmap_unlock(void)
{
    assert(mmap_lock_count > 0);
    if (--mmap_lock_count == 0) {
        pthread_mutex_unlock(&mmap_mutex);
    }
}

bool have_mmap_lock(void)
{
    return mmap_lock_count > 0 ? true : false;
}

/* Grab lock to make sure things are in a consistent state after fork(). */
void mmap_fork_start(void)
{
    if (mmap_lock_count)
        abort();
    pthread_mutex_lock(&mmap_mutex);
}

void mmap_fork_end(int child)
{
    if (child) {
        pthread_mutex_init(&mmap_mutex, NULL);
    } else {
        pthread_mutex_unlock(&mmap_mutex);
    }
}
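
/*
 * Usage sketch (illustrative; the actual caller lives elsewhere in
 * linux-user): the two hooks above are meant to bracket fork() so the
 * child never starts life with mmap_mutex held by a thread that no
 * longer exists in it:
 *
 *     mmap_fork_start();
 *     pid = fork();
 *     mmap_fork_end(pid == 0);
 */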

/* Protected by mmap_lock. */
static IntervalTreeRoot shm_regions;

static void shm_region_add(abi_ptr start, abi_ptr last)
{
    IntervalTreeNode *i = g_new0(IntervalTreeNode, 1);

    i->start = start;
    i->last = last;
    interval_tree_insert(i, &shm_regions);
}

static abi_ptr shm_region_find(abi_ptr start)
{
    IntervalTreeNode *i;

    for (i = interval_tree_iter_first(&shm_regions, start, start); i;
         i = interval_tree_iter_next(i, start, start)) {
        if (i->start == start) {
            return i->last;
        }
    }
    return 0;
}

static void shm_region_rm_complete(abi_ptr start, abi_ptr last)
{
    IntervalTreeNode *i, *n;

    for (i = interval_tree_iter_first(&shm_regions, start, last); i; i = n) {
        n = interval_tree_iter_next(i, start, last);
        if (i->start >= start && i->last <= last) {
            interval_tree_remove(i, &shm_regions);
            g_free(i);
        }
    }
}

/*
 * Validate target prot bitmask.
 * Return 0 if the target prot bitmask is invalid, otherwise
 * the internal qemu page_flags (which will include PAGE_VALID).
 */
static int validate_prot_to_pageflags(int prot)
{
    int valid = PROT_READ | PROT_WRITE | PROT_EXEC | TARGET_PROT_SEM;
    int page_flags = (prot & PAGE_RWX) | PAGE_VALID;

#ifdef TARGET_AARCH64
    {
        ARMCPU *cpu = ARM_CPU(thread_cpu);

        /*
         * The PROT_BTI bit is only accepted if the cpu supports the feature.
         * Since this is the unusual case, don't bother checking unless
         * the bit has been requested.  If set and valid, record the bit
         * within QEMU's page_flags.
         */
        if ((prot & TARGET_PROT_BTI) && cpu_isar_feature(aa64_bti, cpu)) {
            valid |= TARGET_PROT_BTI;
            page_flags |= PAGE_BTI;
        }
        /* Similarly for the PROT_MTE bit. */
        if ((prot & TARGET_PROT_MTE) && cpu_isar_feature(aa64_mte, cpu)) {
            valid |= TARGET_PROT_MTE;
            page_flags |= PAGE_MTE;
        }
    }
#elif defined(TARGET_HPPA)
    valid |= PROT_GROWSDOWN | PROT_GROWSUP;
#endif

    return prot & ~valid ? 0 : page_flags;
}
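
/*
 * Illustrative example (assumes the usual Linux encoding in which
 * PROT_READ/WRITE/EXEC coincide numerically with PAGE_READ/WRITE/EXEC):
 *
 *     validate_prot_to_pageflags(PROT_READ | PROT_WRITE)
 *         -> PAGE_READ | PAGE_WRITE | PAGE_VALID
 *     validate_prot_to_pageflags(PROT_READ | 0x40000000)
 *         -> 0, since the unknown bit is outside 'valid'
 */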

/*
 * For the host, we need not pass anything except read/write/exec.
 * While PROT_SEM is allowed by all hosts, it is also ignored, so
 * don't bother transforming guest bit to host bit.  Any other
 * target-specific prot bits will not be understood by the host
 * and will need to be encoded into page_flags for qemu emulation.
 *
 * Pages that are executable by the guest will never be executed
 * by the host, but the host will need to be able to read them.
 */
static int target_to_host_prot(int prot)
{
    return (prot & (PROT_READ | PROT_WRITE)) |
           (prot & PROT_EXEC ? PROT_READ : 0);
}
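
/*
 * For example (illustrative):
 *
 *     target_to_host_prot(PROT_EXEC)
 *         -> PROT_READ
 *     target_to_host_prot(PROT_READ | PROT_WRITE | PROT_EXEC)
 *         -> PROT_READ | PROT_WRITE
 *
 * The translator reads the guest code; the host never executes it
 * directly.
 */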

/* NOTE: all the constants are the HOST ones, but addresses are target. */
int target_mprotect(abi_ulong start, abi_ulong len, int target_prot)
{
    int host_page_size = qemu_real_host_page_size();
    abi_ulong starts[3];
    abi_ulong lens[3];
    int prots[3];
    abi_ulong host_start, host_last, last;
    int prot1, ret, page_flags, nranges;

    trace_target_mprotect(start, len, target_prot);

    if ((start & ~TARGET_PAGE_MASK) != 0) {
        return -TARGET_EINVAL;
    }
    page_flags = validate_prot_to_pageflags(target_prot);
    if (!page_flags) {
        return -TARGET_EINVAL;
    }
    if (len == 0) {
        return 0;
    }
    len = TARGET_PAGE_ALIGN(len);
    if (!guest_range_valid_untagged(start, len)) {
        return -TARGET_ENOMEM;
    }

    last = start + len - 1;
    host_start = start & -host_page_size;
    host_last = ROUND_UP(last, host_page_size) - 1;
    nranges = 0;

    mmap_lock();

    if (host_last - host_start < host_page_size) {
        /* Single host page contains all guest pages: sum the prot. */
        prot1 = target_prot;
        for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) {
            prot1 |= page_get_flags(a);
        }
        for (abi_ulong a = last; a < host_last; a += TARGET_PAGE_SIZE) {
            prot1 |= page_get_flags(a + 1);
        }
        starts[nranges] = host_start;
        lens[nranges] = host_page_size;
        prots[nranges] = prot1;
        nranges++;
    } else {
        if (host_start < start) {
            /* Host page contains more than one guest page: sum the prot. */
            prot1 = target_prot;
            for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) {
                prot1 |= page_get_flags(a);
            }
            /* If the resulting sum differs, create a new range. */
            if (prot1 != target_prot) {
                starts[nranges] = host_start;
                lens[nranges] = host_page_size;
                prots[nranges] = prot1;
                nranges++;
                host_start += host_page_size;
            }
        }

        if (last < host_last) {
            /* Host page contains more than one guest page: sum the prot. */
            prot1 = target_prot;
            for (abi_ulong a = last; a < host_last; a += TARGET_PAGE_SIZE) {
                prot1 |= page_get_flags(a + 1);
            }
            /* If the resulting sum differs, create a new range. */
            if (prot1 != target_prot) {
                host_last -= host_page_size;
                starts[nranges] = host_last + 1;
                lens[nranges] = host_page_size;
                prots[nranges] = prot1;
                nranges++;
            }
        }

        /* Create a range for the middle, if any remains. */
        if (host_start < host_last) {
            starts[nranges] = host_start;
            lens[nranges] = host_last - host_start + 1;
            prots[nranges] = target_prot;
            nranges++;
        }
    }

    for (int i = 0; i < nranges; ++i) {
        ret = mprotect(g2h_untagged(starts[i]), lens[i],
                       target_to_host_prot(prots[i]));
        if (ret != 0) {
            goto error;
        }
    }

    page_set_flags(start, last, page_flags);
    ret = 0;

 error:
    mmap_unlock();
    return ret;
}
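
/*
 * Worked example for the splitting above (illustrative numbers): with
 * an 8 KiB host page and a 4 KiB target page, target_mprotect(0x1000,
 * 0x4000, prot) covers host pages [0x0000, 0x5fff].  The head host
 * page also holds guest page [0x0000, 0x0fff] and the tail host page
 * also holds [0x5000, 0x5fff]; each of those host pages receives
 * 'prot' OR'ed with the flags of its outside guest page whenever that
 * sum differs, while the middle host page [0x2000, 0x3fff] receives
 * exactly 'prot'.
 */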

/*
 * Perform munmap on behalf of the target, with host parameters.
 * If reserved_va, we must replace the memory reservation.
 */
static int do_munmap(void *addr, size_t len)
{
    if (reserved_va) {
        void *ptr = mmap(addr, len, PROT_NONE,
                         MAP_FIXED | MAP_ANONYMOUS
                         | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
        return ptr == addr ? 0 : -1;
    }
    return munmap(addr, len);
}

/*
 * Perform a pread on behalf of target_mmap.  We can reach EOF, we can be
 * interrupted by signals, and in general there's no good error return path.
 * If @zero, zero the rest of the block at EOF.
 * Return true on success.
 */
static bool mmap_pread(int fd, void *p, size_t len, off_t offset, bool zero)
{
    while (1) {
        ssize_t r = pread(fd, p, len, offset);

        if (likely(r == len)) {
            /* Complete */
            return true;
        }
        if (r == 0) {
            /* EOF */
            if (zero) {
                memset(p, 0, len);
            }
            return true;
        }
        if (r > 0) {
            /* Short read */
            p += r;
            len -= r;
            offset += r;
        } else if (errno != EINTR) {
            /* Error */
            return false;
        }
    }
}

/*
 * Map an incomplete host page.
 *
 * Here be dragons.  This case will not work if there is an existing
 * overlapping host page, which is file mapped, and for which the mapping
 * is beyond the end of the file.  In that case, we will see SIGBUS when
 * trying to write a portion of this page.
 *
 * FIXME: Work around this with a temporary signal handler and longjmp.
 */
static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
                      int prot, int flags, int fd, off_t offset)
{
    int host_page_size = qemu_real_host_page_size();
    abi_ulong real_last;
    void *host_start;
    int prot_old, prot_new;
    int host_prot_old, host_prot_new;

    if (!(flags & MAP_ANONYMOUS)
        && (flags & MAP_TYPE) == MAP_SHARED
        && (prot & PROT_WRITE)) {
        /*
         * msync() won't work with the partial page, so we return an
         * error if write is possible while it is a shared mapping.
         */
        errno = EINVAL;
        return false;
    }

    real_last = real_start + host_page_size - 1;
    host_start = g2h_untagged(real_start);

    /* Get the protection of the target pages outside the mapping. */
    prot_old = 0;
    for (abi_ulong a = real_start; a < start; a += TARGET_PAGE_SIZE) {
        prot_old |= page_get_flags(a);
    }
    for (abi_ulong a = real_last; a > last; a -= TARGET_PAGE_SIZE) {
        prot_old |= page_get_flags(a);
    }

    if (prot_old == 0) {
        /*
         * Since !(prot_old & PAGE_VALID), there were no guest pages
         * outside of the fragment we need to map.  Allocate a new host
         * page to cover, discarding whatever else may have been present.
         */
        void *p = mmap(host_start, host_page_size,
                       target_to_host_prot(prot),
                       flags | MAP_ANONYMOUS, -1, 0);
        if (p != host_start) {
            if (p != MAP_FAILED) {
                do_munmap(p, host_page_size);
                errno = EEXIST;
            }
            return false;
        }
        prot_old = prot;
    }
    prot_new = prot | prot_old;

    host_prot_old = target_to_host_prot(prot_old);
    host_prot_new = target_to_host_prot(prot_new);

    /* Adjust protection to be able to write. */
    if (!(host_prot_old & PROT_WRITE)) {
        host_prot_old |= PROT_WRITE;
        mprotect(host_start, host_page_size, host_prot_old);
    }

    /* Read or zero the new guest pages. */
    if (flags & MAP_ANONYMOUS) {
        memset(g2h_untagged(start), 0, last - start + 1);
    } else if (!mmap_pread(fd, g2h_untagged(start), last - start + 1,
                           offset, true)) {
        return false;
    }

    /* Put final protection */
    if (host_prot_new != host_prot_old) {
        mprotect(host_start, host_page_size, host_prot_new);
    }
    return true;
}
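
/*
 * Worked example (illustrative numbers): host pages are 8 KiB, target
 * pages are 4 KiB, and the guest file-maps the fragment [0x1000,
 * 0x1fff] within host page [0x0000, 0x1fff].  If guest page [0x0000,
 * 0x0fff] is unmapped, prot_old == 0: the whole host page is replaced
 * by a fresh anonymous page and the file contents are read into the
 * fragment.  If it is mapped, the existing host page is kept,
 * temporarily made writable if need be, and only the fragment is read
 * over it.
 */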

abi_ulong task_unmapped_base;
abi_ulong elf_et_dyn_base;
abi_ulong mmap_next_start;

/*
 * Subroutine of mmap_find_vma, used when we have pre-allocated
 * a chunk of guest address space.
 */
static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size,
                                        abi_ulong align)
{
    target_ulong ret;

    ret = page_find_range_empty(start, reserved_va, size, align);
    if (ret == -1 && start > mmap_min_addr) {
        /* Restart at the beginning of the address space. */
        ret = page_find_range_empty(mmap_min_addr, start - 1, size, align);
    }

    return ret;
}

/*
 * Find and reserve a free memory area of size 'size'.  The search
 * starts at 'start'.
 * It must be called with mmap_lock() held.
 * Return -1 if error.
 */
abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, abi_ulong align)
{
    int host_page_size = qemu_real_host_page_size();
    void *ptr, *prev;
    abi_ulong addr;
    int wrapped, repeat;

    align = MAX(align, host_page_size);

    /* If 'start' == 0, then a default start address is used. */
    if (start == 0) {
        start = mmap_next_start;
    } else {
        start &= -host_page_size;
    }
    start = ROUND_UP(start, align);
    size = ROUND_UP(size, host_page_size);

    if (reserved_va) {
        return mmap_find_vma_reserved(start, size, align);
    }

    addr = start;
    wrapped = repeat = 0;
    prev = 0;

    for (;; prev = ptr) {
        /*
         * Reserve needed memory area to avoid a race.
         * It should be discarded using:
         *  - mmap() with MAP_FIXED flag
         *  - mremap() with MREMAP_FIXED flag
         *  - shmat() with SHM_REMAP flag
         */
        ptr = mmap(g2h_untagged(addr), size, PROT_NONE,
                   MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);

        /* ENOMEM, if host address space has no memory */
        if (ptr == MAP_FAILED) {
            return (abi_ulong)-1;
        }

        /*
         * Count the number of sequential returns of the same address.
         * This is used to modify the search algorithm below.
         */
        repeat = (ptr == prev ? repeat + 1 : 0);

        if (h2g_valid(ptr + size - 1)) {
            addr = h2g(ptr);

            if ((addr & (align - 1)) == 0) {
                /* Success. */
                if (start == mmap_next_start && addr >= task_unmapped_base) {
                    mmap_next_start = addr + size;
                }
                return addr;
            }

            /* The address is not properly aligned for the target. */
            switch (repeat) {
            case 0:
                /*
                 * Assume the result that the kernel gave us is the
                 * first with enough free space, so start again at the
                 * next higher target page.
                 */
                addr = ROUND_UP(addr, align);
                break;
            case 1:
                /*
                 * Sometimes the kernel decides to perform the allocation
                 * at the top end of memory instead.
                 */
                addr &= -align;
                break;
            case 2:
                /* Start over at low memory. */
                addr = 0;
                break;
            default:
                /* Fail.  This unaligned block must be the last. */
                addr = -1;
                break;
            }
        } else {
            /*
             * Since the result the kernel gave didn't fit, start
             * again at low memory.  If any repetition, fail.
             */
            addr = (repeat ? -1 : 0);
        }

        /* Unmap and try again. */
        munmap(ptr, size);

        /* ENOMEM if we checked the whole of the target address space. */
        if (addr == (abi_ulong)-1) {
            return (abi_ulong)-1;
        } else if (addr == 0) {
            if (wrapped) {
                return (abi_ulong)-1;
            }
            wrapped = 1;
            /*
             * Don't actually use 0 when wrapping, instead indicate
             * that we'd truly like an allocation in low memory.
             */
            addr = (mmap_min_addr > TARGET_PAGE_SIZE
                    ? TARGET_PAGE_ALIGN(mmap_min_addr)
                    : TARGET_PAGE_SIZE);
        } else if (wrapped && addr >= start) {
            return (abi_ulong)-1;
        }
    }
}

/*
 * Record a successful mmap within the user-exec interval tree.
 */
static abi_long mmap_end(abi_ulong start, abi_ulong last,
                         abi_ulong passthrough_start,
                         abi_ulong passthrough_last,
                         int flags, int page_flags)
{
    if (flags & MAP_ANONYMOUS) {
        page_flags |= PAGE_ANON;
    }
    page_flags |= PAGE_RESET;
    if (passthrough_start > passthrough_last) {
        page_set_flags(start, last, page_flags);
    } else {
        if (start < passthrough_start) {
            page_set_flags(start, passthrough_start - 1, page_flags);
        }
        page_set_flags(passthrough_start, passthrough_last,
                       page_flags | PAGE_PASSTHROUGH);
        if (passthrough_last < last) {
            page_set_flags(passthrough_last + 1, last, page_flags);
        }
    }
    shm_region_rm_complete(start, last);
    trace_target_mmap_complete(start);
    if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
        FILE *f = qemu_log_trylock();
        if (f) {
            fprintf(f, "page layout changed following mmap\n");
            page_dump(f);
            qemu_log_unlock(f);
        }
    }
    return start;
}

/*
 * Special case host page size == target page size,
 * where there are no edge conditions.
 */
static abi_long mmap_h_eq_g(abi_ulong start, abi_ulong len,
                            int host_prot, int flags, int page_flags,
                            int fd, off_t offset)
{
    void *p, *want_p = NULL;
    abi_ulong last;

    if (start || (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
        want_p = g2h_untagged(start);
    }

    p = mmap(want_p, len, host_prot, flags, fd, offset);
    if (p == MAP_FAILED) {
        return -1;
    }
    /* If the host kernel does not support MAP_FIXED_NOREPLACE, emulate. */
    if ((flags & MAP_FIXED_NOREPLACE) && p != want_p) {
        do_munmap(p, len);
        errno = EEXIST;
        return -1;
    }

    start = h2g(p);
    last = start + len - 1;
    return mmap_end(start, last, start, last, flags, page_flags);
}
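
/*
 * Background note (not from this file): MAP_FIXED_NOREPLACE was only
 * added in Linux 4.17; older kernels ignore the unknown flag and treat
 * the address as a mere hint, possibly returning some other address.
 * The p != want_p check above converts that into the EEXIST failure
 * the guest expects.
 */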

/*
 * Special case host page size < target page size.
 *
 * The two special cases are increased guest alignment, and mapping
 * past the end of a file.
 *
 * When mapping files into a memory area larger than the file,
 * accesses to pages beyond the file size will cause a SIGBUS.
 *
 * For example, if mmapping a file of 100 bytes on a host with 4K
 * pages emulating a target with 8K pages, the target expects to
 * be able to access the first 8K.  But the host will trap us on
 * any access beyond 4K.
 *
 * When emulating a target with a larger page-size than the host's,
 * we may need to truncate file maps at EOF and add extra anonymous
 * pages up to the target's page boundary.
 *
 * This workaround only works for files that do not change.
 * If the file is later extended (e.g. ftruncate), the SIGBUS
 * vanishes and the proper behaviour is that changes within the
 * anon page should be reflected in the file.
 *
 * However, this case is rather common with executable images,
 * so the workaround is important for even trivial tests, whereas
 * the mmap of a file being extended is less common.
 */
static abi_long mmap_h_lt_g(abi_ulong start, abi_ulong len, int host_prot,
                            int mmap_flags, int page_flags, int fd,
                            off_t offset, int host_page_size)
{
    void *p, *want_p = NULL;
    off_t fileend_adj = 0;
    int flags = mmap_flags;
    abi_ulong last, pass_last;

    if (start || (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
        want_p = g2h_untagged(start);
    }

    if (!(flags & MAP_ANONYMOUS)) {
        struct stat sb;

        if (fstat(fd, &sb) == -1) {
            return -1;
        }
        if (offset >= sb.st_size) {
            /*
             * The entire map is beyond the end of the file.
             * Transform it to an anonymous mapping.
             */
            flags |= MAP_ANONYMOUS;
            fd = -1;
            offset = 0;
        } else if (offset + len > sb.st_size) {
            /*
             * A portion of the map is beyond the end of the file.
             * Truncate the file portion of the allocation.
             */
            fileend_adj = offset + len - sb.st_size;
        }
    }

    if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
        if (fileend_adj) {
            p = mmap(want_p, len, host_prot, flags | MAP_ANONYMOUS, -1, 0);
        } else {
            p = mmap(want_p, len, host_prot, flags, fd, offset);
        }
        if (p != want_p) {
            if (p != MAP_FAILED) {
                /* Host does not support MAP_FIXED_NOREPLACE: emulate. */
                do_munmap(p, len);
                errno = EEXIST;
            }
            return -1;
        }

        if (fileend_adj) {
            void *t = mmap(p, len - fileend_adj, host_prot,
                           (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED,
                           fd, offset);

            if (t == MAP_FAILED) {
                int save_errno = errno;

                /*
                 * We failed a map over the top of the successful anonymous
                 * mapping above. The only failure mode is running out of VMAs,
                 * and there's nothing that we can do to detect that earlier.
                 * If we have replaced an existing mapping with MAP_FIXED,
                 * then we cannot properly recover.  It's a coin toss whether
                 * it would be better to exit or continue here.
                 */
                if (!(flags & MAP_FIXED_NOREPLACE) &&
                    !page_check_range_empty(start, start + len - 1)) {
                    qemu_log("QEMU target_mmap late failure: %s",
                             strerror(save_errno));
                }

                do_munmap(want_p, len);
                errno = save_errno;
                return -1;
            }
        }
    } else {
        size_t host_len, part_len;

        /*
         * Take care to align the host memory.  Perform a larger anonymous
         * allocation and extract the aligned portion.  Remap the file on
         * top of that.
         */
        host_len = len + TARGET_PAGE_SIZE - host_page_size;
        p = mmap(want_p, host_len, host_prot, flags | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            return -1;
        }

        part_len = (uintptr_t)p & (TARGET_PAGE_SIZE - 1);
        if (part_len) {
            part_len = TARGET_PAGE_SIZE - part_len;
            do_munmap(p, part_len);
            p += part_len;
            host_len -= part_len;
        }
        if (len < host_len) {
            do_munmap(p + len, host_len - len);
        }

        if (!(flags & MAP_ANONYMOUS)) {
            void *t = mmap(p, len - fileend_adj, host_prot,
                           flags | MAP_FIXED, fd, offset);

            if (t == MAP_FAILED) {
                int save_errno = errno;
                do_munmap(p, len);
                errno = save_errno;
                return -1;
            }
        }

        start = h2g(p);
    }

    last = start + len - 1;
    if (fileend_adj) {
        pass_last = ROUND_UP(last - fileend_adj, host_page_size) - 1;
    } else {
        pass_last = last;
    }
    return mmap_end(start, last, start, pass_last, mmap_flags, page_flags);
}
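
/*
 * Worked example of the alignment dance in the non-FIXED path above
 * (illustrative numbers): with 4 KiB host pages and 16 KiB target
 * pages, a 16 KiB request over-allocates host_len = 16K + 16K - 4K =
 * 28 KiB anonymously, trims the head up to the next 16 KiB boundary
 * and any tail beyond len, and finally maps the file MAP_FIXED over
 * the surviving, now target-aligned, window.
 */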

/*
 * Special case host page size > target page size.
 *
 * The two special cases are address and file offsets that are valid
 * for the guest that cannot be directly represented by the host.
 */
static abi_long mmap_h_gt_g(abi_ulong start, abi_ulong len,
                            int target_prot, int host_prot,
                            int flags, int page_flags, int fd,
                            off_t offset, int host_page_size)
{
    void *p, *want_p = NULL;
    off_t host_offset = offset & -host_page_size;
    abi_ulong last, real_start, real_last;
    bool misaligned_offset = false;
    size_t host_len;

    if (start || (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
        want_p = g2h_untagged(start);
    }

    if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
        /*
         * Adjust the offset to something representable on the host.
         */
        host_len = len + offset - host_offset;
        p = mmap(want_p, host_len, host_prot, flags, fd, host_offset);
        if (p == MAP_FAILED) {
            return -1;
        }

        /* Update start to the file position at offset. */
        p += offset - host_offset;

        start = h2g(p);
        last = start + len - 1;
        return mmap_end(start, last, start, last, flags, page_flags);
    }

    if (!(flags & MAP_ANONYMOUS)) {
        misaligned_offset = (start ^ offset) & (host_page_size - 1);

        /*
         * The fallback for misalignment is a private mapping + read.
         * This carries none of the semantics required of MAP_SHARED.
         */
        if (misaligned_offset && (flags & MAP_TYPE) != MAP_PRIVATE) {
            errno = EINVAL;
            return -1;
        }
    }

    last = start + len - 1;
    real_start = start & -host_page_size;
    real_last = ROUND_UP(last, host_page_size) - 1;

    /*
     * Handle the start and end of the mapping.
     */
    if (real_start < start) {
        abi_ulong real_page_last = real_start + host_page_size - 1;
        if (last <= real_page_last) {
            /* Entire allocation a subset of one host page. */
            if (!mmap_frag(real_start, start, last, target_prot,
                           flags, fd, offset)) {
                return -1;
            }
            return mmap_end(start, last, -1, 0, flags, page_flags);
        }

        if (!mmap_frag(real_start, start, real_page_last, target_prot,
                       flags, fd, offset)) {
            return -1;
        }
        real_start = real_page_last + 1;
    }

    if (last < real_last) {
        abi_ulong real_page_start = real_last - host_page_size + 1;
        if (!mmap_frag(real_page_start, real_page_start, last,
                       target_prot, flags, fd,
                       offset + real_page_start - start)) {
            return -1;
        }
        real_last = real_page_start - 1;
    }

    if (real_start > real_last) {
        return mmap_end(start, last, -1, 0, flags, page_flags);
    }

    /*
     * Handle the middle of the mapping.
     */

    host_len = real_last - real_start + 1;
    want_p += real_start - start;

    if (flags & MAP_ANONYMOUS) {
        p = mmap(want_p, host_len, host_prot, flags, -1, 0);
    } else if (!misaligned_offset) {
        p = mmap(want_p, host_len, host_prot, flags, fd,
                 offset + real_start - start);
    } else {
        p = mmap(want_p, host_len, host_prot | PROT_WRITE,
                 flags | MAP_ANONYMOUS, -1, 0);
    }
    if (p != want_p) {
        if (p != MAP_FAILED) {
            do_munmap(p, host_len);
            errno = EEXIST;
        }
        return -1;
    }

    if (misaligned_offset) {
        if (!mmap_pread(fd, p, host_len, offset + real_start - start, false)) {
            do_munmap(p, host_len);
            return -1;
        }
        if (!(host_prot & PROT_WRITE)) {
            mprotect(p, host_len, host_prot);
        }
    }

    return mmap_end(start, last, -1, 0, flags, page_flags);
}
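
/*
 * Worked example for the misaligned-offset fallback (illustrative
 * numbers): with 64 KiB host pages and 4 KiB target pages, a guest
 * MAP_PRIVATE file mapping at start 0x14000 with offset 0x2000 is
 * page-aligned for the guest, but (0x14000 ^ 0x2000) & 0xffff != 0,
 * so no host mmap offset can produce it.  The middle is therefore
 * mapped anonymously and filled in with mmap_pread(); MAP_SHARED
 * requests of this shape are rejected with EINVAL above.
 */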

static abi_long target_mmap__locked(abi_ulong start, abi_ulong len,
                                    int target_prot, int flags, int page_flags,
                                    int fd, off_t offset)
{
    int host_page_size = qemu_real_host_page_size();
    int host_prot;

    /*
     * For reserved_va, we are in full control of the allocation.
     * Find a suitable hole and convert to MAP_FIXED.
     */
    if (reserved_va) {
        if (flags & MAP_FIXED_NOREPLACE) {
            /* Validate that the chosen range is empty. */
            if (!page_check_range_empty(start, start + len - 1)) {
                errno = EEXIST;
                return -1;
            }
            flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED;
        } else if (!(flags & MAP_FIXED)) {
            abi_ulong real_start = start & -host_page_size;
            off_t host_offset = offset & -host_page_size;
            size_t real_len = len + offset - host_offset;
            abi_ulong align = MAX(host_page_size, TARGET_PAGE_SIZE);

            start = mmap_find_vma(real_start, real_len, align);
            if (start == (abi_ulong)-1) {
                errno = ENOMEM;
                return -1;
            }
            start += offset - host_offset;
            flags |= MAP_FIXED;
        }
    }

    host_prot = target_to_host_prot(target_prot);

    if (host_page_size == TARGET_PAGE_SIZE) {
        return mmap_h_eq_g(start, len, host_prot, flags,
                           page_flags, fd, offset);
    } else if (host_page_size < TARGET_PAGE_SIZE) {
        return mmap_h_lt_g(start, len, host_prot, flags,
                           page_flags, fd, offset, host_page_size);
    } else {
        return mmap_h_gt_g(start, len, target_prot, host_prot, flags,
                           page_flags, fd, offset, host_page_size);
    }
}

/* NOTE: all the constants are the HOST ones */
abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
                     int flags, int fd, off_t offset)
{
    abi_long ret;
    int page_flags;

    trace_target_mmap(start, len, target_prot, flags, fd, offset);

    if (!len) {
        errno = EINVAL;
        return -1;
    }

    page_flags = validate_prot_to_pageflags(target_prot);
    if (!page_flags) {
        errno = EINVAL;
        return -1;
    }

    /* Also check for overflows... */
    len = TARGET_PAGE_ALIGN(len);
    if (!len || len != (size_t)len) {
        errno = ENOMEM;
        return -1;
    }

    if (offset & ~TARGET_PAGE_MASK) {
        errno = EINVAL;
        return -1;
    }
    if (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE)) {
        if (start & ~TARGET_PAGE_MASK) {
            errno = EINVAL;
            return -1;
        }
        if (!guest_range_valid_untagged(start, len)) {
            errno = ENOMEM;
            return -1;
        }
    }

    mmap_lock();

    ret = target_mmap__locked(start, len, target_prot, flags,
                              page_flags, fd, offset);

    mmap_unlock();

    /*
     * If we're mapping shared memory, ensure we generate code for parallel
     * execution and flush old translations.  This will work up to the level
     * supported by the host -- anything that requires EXCP_ATOMIC will not
     * be atomic with respect to an external process.
     */
    if (ret != -1 && (flags & MAP_TYPE) != MAP_PRIVATE) {
        CPUState *cpu = thread_cpu;
        if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
            tcg_cflags_set(cpu, CF_PARALLEL);
            tb_flush(cpu);
        }
    }

    return ret;
}

static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len)
{
    int host_page_size = qemu_real_host_page_size();
    abi_ulong real_start;
    abi_ulong real_last;
    abi_ulong real_len;
    abi_ulong last;
    abi_ulong a;
    void *host_start;
    int prot;

    last = start + len - 1;
    real_start = start & -host_page_size;
    real_last = ROUND_UP(last, host_page_size) - 1;

    /*
     * If guest pages remain on the first or last host pages,
     * adjust the deallocation to retain those guest pages.
     * The single page special case is required for the last page,
     * lest real_start overflow to zero.
     */
    if (real_last - real_start < host_page_size) {
        prot = 0;
        for (a = real_start; a < start; a += TARGET_PAGE_SIZE) {
            prot |= page_get_flags(a);
        }
        for (a = last; a < real_last; a += TARGET_PAGE_SIZE) {
            prot |= page_get_flags(a + 1);
        }
        if (prot != 0) {
            return 0;
        }
    } else {
        for (prot = 0, a = real_start; a < start; a += TARGET_PAGE_SIZE) {
            prot |= page_get_flags(a);
        }
        if (prot != 0) {
            real_start += host_page_size;
        }

        for (prot = 0, a = last; a < real_last; a += TARGET_PAGE_SIZE) {
            prot |= page_get_flags(a + 1);
        }
        if (prot != 0) {
            real_last -= host_page_size;
        }

        if (real_last < real_start) {
            return 0;
        }
    }

    real_len = real_last - real_start + 1;
    host_start = g2h_untagged(real_start);

    return do_munmap(host_start, real_len);
}
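
/*
 * Worked example (illustrative numbers): with 8 KiB host pages and
 * 4 KiB target pages, unmapping guest range [0x1000, 0x2fff] while
 * guest page [0x0000, 0x0fff] is still mapped and [0x3000, 0x3fff] is
 * free: real_start advances from 0x0000 to 0x2000 to preserve the
 * neighbour, and only host page [0x2000, 0x3fff] is released (or
 * re-reserved, under reserved_va).
 */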

int target_munmap(abi_ulong start, abi_ulong len)
{
    int ret;

    trace_target_munmap(start, len);

    if (start & ~TARGET_PAGE_MASK) {
        errno = EINVAL;
        return -1;
    }
    len = TARGET_PAGE_ALIGN(len);
    if (len == 0 || !guest_range_valid_untagged(start, len)) {
        errno = EINVAL;
        return -1;
    }

    mmap_lock();
    ret = mmap_reserve_or_unmap(start, len);
    if (likely(ret == 0)) {
        page_set_flags(start, start + len - 1, 0);
        shm_region_rm_complete(start, start + len - 1);
    }
    mmap_unlock();

    return ret;
}

abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
                       abi_ulong new_size, unsigned long flags,
                       abi_ulong new_addr)
{
    int prot;
    void *host_addr;

    if (!guest_range_valid_untagged(old_addr, old_size) ||
        ((flags & MREMAP_FIXED) &&
         !guest_range_valid_untagged(new_addr, new_size)) ||
        ((flags & MREMAP_MAYMOVE) == 0 &&
         !guest_range_valid_untagged(old_addr, new_size))) {
        errno = ENOMEM;
        return -1;
    }

    mmap_lock();

    if (flags & MREMAP_FIXED) {
        host_addr = mremap(g2h_untagged(old_addr), old_size, new_size,
                           flags, g2h_untagged(new_addr));

        if (reserved_va && host_addr != MAP_FAILED) {
            /*
             * If new and old addresses overlap then the above mremap will
             * already have failed with EINVAL.
             */
            mmap_reserve_or_unmap(old_addr, old_size);
        }
    } else if (flags & MREMAP_MAYMOVE) {
        abi_ulong mmap_start;

        mmap_start = mmap_find_vma(0, new_size, TARGET_PAGE_SIZE);

        if (mmap_start == -1) {
            errno = ENOMEM;
            host_addr = MAP_FAILED;
        } else {
            host_addr = mremap(g2h_untagged(old_addr), old_size, new_size,
                               flags | MREMAP_FIXED,
                               g2h_untagged(mmap_start));
            if (reserved_va) {
                mmap_reserve_or_unmap(old_addr, old_size);
            }
        }
    } else {
        int page_flags = 0;
        if (reserved_va && old_size < new_size) {
            abi_ulong addr;
            for (addr = old_addr + old_size;
                 addr < old_addr + new_size;
                 addr++) {
                page_flags |= page_get_flags(addr);
            }
        }
        if (page_flags == 0) {
            host_addr = mremap(g2h_untagged(old_addr),
                               old_size, new_size, flags);

            if (host_addr != MAP_FAILED) {
                /* Check if address fits target address space */
                if (!guest_range_valid_untagged(h2g(host_addr), new_size)) {
                    /* Revert mremap() changes */
                    host_addr = mremap(g2h_untagged(old_addr),
                                       new_size, old_size, flags);
                    errno = ENOMEM;
                    host_addr = MAP_FAILED;
                } else if (reserved_va && old_size > new_size) {
                    mmap_reserve_or_unmap(old_addr + old_size,
                                          old_size - new_size);
                }
            }
        } else {
            errno = ENOMEM;
            host_addr = MAP_FAILED;
        }
    }

    if (host_addr == MAP_FAILED) {
        new_addr = -1;
    } else {
        new_addr = h2g(host_addr);
        prot = page_get_flags(old_addr);
        page_set_flags(old_addr, old_addr + old_size - 1, 0);
        shm_region_rm_complete(old_addr, old_addr + old_size - 1);
        page_set_flags(new_addr, new_addr + new_size - 1,
                       prot | PAGE_VALID | PAGE_RESET);
        shm_region_rm_complete(new_addr, new_addr + new_size - 1);
    }
    mmap_unlock();
    return new_addr;
}

abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice)
{
    abi_ulong len;
    int ret = 0;

    if (start & ~TARGET_PAGE_MASK) {
        return -TARGET_EINVAL;
    }
    if (len_in == 0) {
        return 0;
    }
    len = TARGET_PAGE_ALIGN(len_in);
    if (len == 0 || !guest_range_valid_untagged(start, len)) {
        return -TARGET_EINVAL;
    }

    /* Translate for some architectures which have different MADV_xxx values */
    switch (advice) {
    case TARGET_MADV_DONTNEED:      /* alpha */
        advice = MADV_DONTNEED;
        break;
    case TARGET_MADV_WIPEONFORK:    /* parisc */
        advice = MADV_WIPEONFORK;
        break;
    case TARGET_MADV_KEEPONFORK:    /* parisc */
        advice = MADV_KEEPONFORK;
        break;
    /* we do not care about the other MADV_xxx values yet */
    }

    /*
     * Most advice values are hints, so ignoring and returning success is ok.
     *
     * However, some advice values such as MADV_DONTNEED, MADV_WIPEONFORK and
     * MADV_KEEPONFORK are not hints and need to be emulated.
     *
     * A straight passthrough for those may not be safe because qemu sometimes
     * turns private file-backed mappings into anonymous mappings.
     * If all guest pages have PAGE_PASSTHROUGH set, mappings have the
     * same semantics for the host as for the guest.
     *
     * We pass through MADV_WIPEONFORK and MADV_KEEPONFORK if possible and
     * return failure if not.
     *
     * MADV_DONTNEED is passed through as well, if possible.
     * If passthrough isn't possible, we nevertheless (wrongly!) return
     * success, which is broken but some userspace programs fail to work
     * otherwise.  Completely implementing such emulation is quite
     * complicated though.
     */
    mmap_lock();
    switch (advice) {
    case MADV_WIPEONFORK:
    case MADV_KEEPONFORK:
        ret = -EINVAL;
        /* fall through */
    case MADV_DONTNEED:
        if (page_check_range(start, len, PAGE_PASSTHROUGH)) {
            ret = get_errno(madvise(g2h_untagged(start), len, advice));
            if ((advice == MADV_DONTNEED) && (ret == 0)) {
                page_reset_target_data(start, start + len - 1);
            }
        }
    }
    mmap_unlock();

    return ret;
}
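
/*
 * Example of the distinction drawn above (illustrative): MADV_FREE is
 * a pure hint, so it falls through the switch and reports success
 * without touching the host.  MADV_WIPEONFORK changes semantics, so on
 * a range that is not entirely PAGE_PASSTHROUGH it fails with -EINVAL
 * rather than silently providing the wrong fork() behaviour.
 */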

#ifndef TARGET_FORCE_SHMLBA
/*
 * For most architectures, SHMLBA is the same as the page size;
 * some architectures have larger values, in which case they should
 * define TARGET_FORCE_SHMLBA and provide a target_shmlba() function.
 * This corresponds to the kernel arch code defining __ARCH_FORCE_SHMLBA
 * and defining its own value for SHMLBA.
 *
 * The kernel also permits SHMLBA to be set by the architecture to a
 * value larger than the page size without setting __ARCH_FORCE_SHMLBA;
 * this means that addresses are rounded to the large size if
 * SHM_RND is set but addresses not aligned to that size are not rejected
 * as long as they are at least page-aligned.  Since the only architecture
 * which uses this is ia64 this code doesn't provide for that oddity.
 */
static inline abi_ulong target_shmlba(CPUArchState *cpu_env)
{
    return TARGET_PAGE_SIZE;
}
#endif
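
/*
 * Example (background assumption, not from this file): 32-bit Arm
 * kernels define __ARCH_FORCE_SHMLBA with SHMLBA = 4 * PAGE_SIZE to
 * avoid cache aliasing, so the arm target defines TARGET_FORCE_SHMLBA
 * and its target_shmlba() returns 16 KiB rather than the 4 KiB
 * TARGET_PAGE_SIZE default above.
 */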

#if defined(__arm__) || defined(__mips__) || defined(__sparc__)
#define HOST_FORCE_SHMLBA 1
#else
#define HOST_FORCE_SHMLBA 0
#endif

abi_ulong target_shmat(CPUArchState *cpu_env, int shmid,
                       abi_ulong shmaddr, int shmflg)
{
    CPUState *cpu = env_cpu(cpu_env);
    struct shmid_ds shm_info;
    int ret;
    int h_pagesize;
    int t_shmlba, h_shmlba, m_shmlba;
    size_t t_len, h_len, m_len;

    /* shmat pointers are always untagged */

    /*
     * Because we can't use host shmat() unless the address is sufficiently
     * aligned for the host, we'll need to check both.
     * TODO: Could be fixed with softmmu.
     */
    t_shmlba = target_shmlba(cpu_env);
    h_pagesize = qemu_real_host_page_size();
    h_shmlba = (HOST_FORCE_SHMLBA ? SHMLBA : h_pagesize);
    m_shmlba = MAX(t_shmlba, h_shmlba);

    if (shmaddr) {
        if (shmaddr & (m_shmlba - 1)) {
            if (shmflg & SHM_RND) {
                /*
                 * The guest is allowing the kernel to round the address.
                 * Assume that the guest is ok with us rounding to the
                 * host required alignment too.  Anyway if we don't, we'll
                 * get an error from the kernel.
                 */
                shmaddr &= ~(m_shmlba - 1);
                if (shmaddr == 0 && (shmflg & SHM_REMAP)) {
                    return -TARGET_EINVAL;
                }
            } else {
                int require = TARGET_PAGE_SIZE;
#ifdef TARGET_FORCE_SHMLBA
                require = t_shmlba;
#endif
                /*
                 * Include host required alignment, as otherwise we cannot
                 * use host shmat at all.
                 */
                require = MAX(require, h_shmlba);
                if (shmaddr & (require - 1)) {
                    return -TARGET_EINVAL;
                }
            }
        }
    } else {
        if (shmflg & SHM_REMAP) {
            return -TARGET_EINVAL;
        }
    }
    /* All rounding now manually concluded. */
    shmflg &= ~SHM_RND;

    /* Find out the length of the shared memory segment. */
    ret = get_errno(shmctl(shmid, IPC_STAT, &shm_info));
    if (is_error(ret)) {
        /* can't get length, bail out */
        return ret;
    }
    t_len = TARGET_PAGE_ALIGN(shm_info.shm_segsz);
    h_len = ROUND_UP(shm_info.shm_segsz, h_pagesize);
    m_len = MAX(t_len, h_len);

    if (!guest_range_valid_untagged(shmaddr, m_len)) {
        return -TARGET_EINVAL;
    }

    WITH_MMAP_LOCK_GUARD() {
        bool mapped = false;
        void *want, *test;
        abi_ulong last;

        if (!shmaddr) {
            shmaddr = mmap_find_vma(0, m_len, m_shmlba);
            if (shmaddr == -1) {
                return -TARGET_ENOMEM;
            }
            mapped = !reserved_va;
        } else if (shmflg & SHM_REMAP) {
            /*
             * If host page size > target page size, the host shmat may map
             * more memory than the guest expects.  Reject a mapping that
             * would replace memory in the unexpected gap.
             * TODO: Could be fixed with softmmu.
             */
            if (t_len < h_len &&
                !page_check_range_empty(shmaddr + t_len,
                                        shmaddr + h_len - 1)) {
                return -TARGET_EINVAL;
            }
        } else {
            if (!page_check_range_empty(shmaddr, shmaddr + m_len - 1)) {
                return -TARGET_EINVAL;
            }
        }

        /* All placement is now complete. */
        want = (void *)g2h_untagged(shmaddr);

        /*
         * Map anonymous pages across the entire range, then remap with
         * the shared memory.  This is required for a number of corner
         * cases for which host and guest page sizes differ.
         */
        if (h_len != t_len) {
            int mmap_p = PROT_READ | (shmflg & SHM_RDONLY ? 0 : PROT_WRITE);
            int mmap_f = MAP_PRIVATE | MAP_ANONYMOUS
                       | (reserved_va || mapped || (shmflg & SHM_REMAP)
                          ? MAP_FIXED : MAP_FIXED_NOREPLACE);

            test = mmap(want, m_len, mmap_p, mmap_f, -1, 0);
            if (unlikely(test != want)) {
                /* shmat returns EINVAL not EEXIST like mmap. */
                ret = (test == MAP_FAILED && errno != EEXIST
                       ? get_errno(-1) : -TARGET_EINVAL);
                if (mapped) {
                    do_munmap(want, m_len);
                }
                return ret;
            }
            mapped = true;
        }

        if (reserved_va || mapped) {
            shmflg |= SHM_REMAP;
        }
        test = shmat(shmid, want, shmflg);
        if (test == MAP_FAILED) {
            ret = get_errno(-1);
            if (mapped) {
                do_munmap(want, m_len);
            }
            return ret;
        }
        assert(test == want);

        last = shmaddr + m_len - 1;
        page_set_flags(shmaddr, last,
                       PAGE_VALID | PAGE_RESET | PAGE_READ |
                       (shmflg & SHM_RDONLY ? 0 : PAGE_WRITE) |
                       (shmflg & SHM_EXEC ? PAGE_EXEC : 0));

        shm_region_rm_complete(shmaddr, last);
        shm_region_add(shmaddr, last);
    }

    /*
     * We're mapping shared memory, so ensure we generate code for parallel
     * execution and flush old translations.  This will work up to the level
     * supported by the host -- anything that requires EXCP_ATOMIC will not
     * be atomic with respect to an external process.
     */
    if (!tcg_cflags_has(cpu, CF_PARALLEL)) {
        tcg_cflags_set(cpu, CF_PARALLEL);
        tb_flush(cpu);
    }

    if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
        FILE *f = qemu_log_trylock();
        if (f) {
            fprintf(f, "page layout changed following shmat\n");
            page_dump(f);
            qemu_log_unlock(f);
        }
    }
    return shmaddr;
}
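
/*
 * Gap example for the SHM_REMAP check above (illustrative numbers): a
 * 20 KiB segment gives t_len = 20 KiB on a 4 KiB-page guest but
 * h_len = 64 KiB on a 64 KiB-page host.  Host shmat would also clobber
 * [shmaddr + 20K, shmaddr + 64K), so the attach is refused unless that
 * gap is empty of guest mappings.
 */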

abi_long target_shmdt(abi_ulong shmaddr)
{
    abi_long rv;

    /* shmdt pointers are always untagged */

    WITH_MMAP_LOCK_GUARD() {
        abi_ulong last = shm_region_find(shmaddr);
        if (last == 0) {
            return -TARGET_EINVAL;
        }

        rv = get_errno(shmdt(g2h_untagged(shmaddr)));
        if (rv == 0) {
            abi_ulong size = last - shmaddr + 1;

            page_set_flags(shmaddr, last, 0);
            shm_region_rm_complete(shmaddr, last);
            mmap_reserve_or_unmap(shmaddr, size);
        }
    }
    return rv;
}