qemu
1/*
2* QEMU low level functions
3*
4* Copyright (c) 2003 Fabrice Bellard
5*
6* Permission is hereby granted, free of charge, to any person obtaining a copy
7* of this software and associated documentation files (the "Software"), to deal
8* in the Software without restriction, including without limitation the rights
9* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10* copies of the Software, and to permit persons to whom the Software is
11* furnished to do so, subject to the following conditions:
12*
13* The above copyright notice and this permission notice shall be included in
14* all copies or substantial portions of the Software.
15*
16* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22* THE SOFTWARE.
23*/
24#include "qemu/osdep.h"25#include "qapi/error.h"26#include "qemu/cutils.h"27#include "qemu/sockets.h"28#include "qemu/error-report.h"29#include "qemu/madvise.h"30#include "qemu/mprotect.h"31#include "qemu/hw-version.h"32#include "monitor/monitor.h"33
34static const char *hw_version = QEMU_HW_VERSION;35
/*
 * Set the TCP_CORK option of socket @fd to @v.
 *
 * When SOL_TCP or TCP_CORK is not available at build time this is a
 * no-op that reports success.
 *
 * Returns the setsockopt() result, or 0 when the option is unsupported.
 */
int socket_set_cork(int fd, int v)
{
    int rc = 0;

#if defined(SOL_TCP) && defined(TCP_CORK)
    rc = setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
#endif

    return rc;
}
44
/*
 * Enable TCP_NODELAY on socket @fd.
 *
 * Returns the setsockopt() result (0 on success, -1 with errno set on
 * failure).
 */
int socket_set_nodelay(int fd)
{
    const int enable = 1;

    return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &enable, sizeof(enable));
}
50
51int qemu_madvise(void *addr, size_t len, int advice)52{
53if (advice == QEMU_MADV_INVALID) {54errno = EINVAL;55return -1;56}57#if defined(CONFIG_MADVISE)58return madvise(addr, len, advice);59#elif defined(CONFIG_POSIX_MADVISE)60int rc = posix_madvise(addr, len, advice);61if (rc) {62errno = rc;63return -1;64}65return 0;66#else67errno = ENOSYS;68return -1;69#endif70}
71
/*
 * Change the protection of [@addr, @addr + @size) to the host-specific
 * value @prot, using VirtualProtect() on Windows and mprotect()
 * elsewhere.
 *
 * Both @addr and @size must have no bits set outside
 * qemu_real_host_page_mask(); this is asserted.
 *
 * Returns 0 on success.  On failure an error is reported via
 * error_report() and -1 is returned.
 */
static int qemu_mprotect__osdep(void *addr, size_t size, int prot)
{
    g_assert(((uintptr_t)addr & ~qemu_real_host_page_mask()) == 0);
    g_assert((size & ~qemu_real_host_page_mask()) == 0);

#ifdef _WIN32
    DWORD prev_protect;

    if (VirtualProtect(addr, size, prot, &prev_protect)) {
        return 0;
    }

    g_autofree gchar *emsg = g_win32_error_message(GetLastError());
    error_report("%s: VirtualProtect failed: %s", __func__, emsg);
    return -1;
#else
    if (mprotect(addr, size, prot) == 0) {
        return 0;
    }

    error_report("%s: mprotect failed: %s", __func__, strerror(errno));
    return -1;
#endif
}
94
/*
 * Make [@addr, @addr + @size) readable and writable.
 *
 * Returns the result of qemu_mprotect__osdep().
 */
int qemu_mprotect_rw(void *addr, size_t size)
{
#ifdef _WIN32
    const int prot = PAGE_READWRITE;
#else
    const int prot = PROT_READ | PROT_WRITE;
#endif

    return qemu_mprotect__osdep(addr, size, prot);
}
103
/*
 * Make [@addr, @addr + @size) readable, writable and executable.
 *
 * Returns the result of qemu_mprotect__osdep().
 */
int qemu_mprotect_rwx(void *addr, size_t size)
{
#ifdef _WIN32
    const int prot = PAGE_EXECUTE_READWRITE;
#else
    const int prot = PROT_READ | PROT_WRITE | PROT_EXEC;
#endif

    return qemu_mprotect__osdep(addr, size, prot);
}
112
/*
 * Remove all access to [@addr, @addr + @size).
 *
 * Returns the result of qemu_mprotect__osdep().
 */
int qemu_mprotect_none(void *addr, size_t size)
{
#ifdef _WIN32
    const int prot = PAGE_NOACCESS;
#else
    const int prot = PROT_NONE;
#endif

    return qemu_mprotect__osdep(addr, size, prot);
}
121
122#ifndef _WIN32123
/*
 * fcntl() commands used for setting and querying file locks.  Both
 * stay at -1 until qemu_probe_lock_ops() has selected either the OFD
 * or the classic F_SETLK/F_GETLK commands.
 */
static int fcntl_op_setlk = -1;
static int fcntl_op_getlk = -1;
/*
 * Duplicate @fd and apply @flags to the duplicate.
 *
 * The O_SYNC state requested in @flags must match that of the
 * duplicate, otherwise the call fails with EINVAL.  The remaining flags
 * are applied with fcntl(F_SETFL), and the file is truncated in the
 * cases where open() would have truncated it (O_TRUNC, or O_CREAT
 * together with O_EXCL).
 *
 * Returns the new descriptor, or -1 with errno set.  On failure the
 * duplicate is closed without disturbing errno.
 */
int qemu_dup_flags(int fd, int flags)
{
    const int creat_excl = O_CREAT | O_EXCL;
    int saved_errno;
    int cur_flags;
    int newfd;

    newfd = qemu_dup(fd);
    if (newfd == -1) {
        /* nothing to clean up; errno comes from qemu_dup() */
        return -1;
    }

    cur_flags = fcntl(newfd, F_GETFL);
    if (cur_flags == -1) {
        goto err_close;
    }

    /* O_SYNC must agree between the request and the duplicate */
    if ((flags ^ cur_flags) & O_SYNC) {
        errno = EINVAL;
        goto err_close;
    }

    /* Set/unset flags that we can with fcntl */
    if (fcntl(newfd, F_SETFL, flags) == -1) {
        goto err_close;
    }

    /* Truncate the file in the cases that open() would truncate it */
    if ((flags & O_TRUNC) || (flags & creat_excl) == creat_excl) {
        if (ftruncate(newfd, 0) == -1) {
            goto err_close;
        }
    }

    return newfd;

err_close:
    saved_errno = errno;
    close(newfd);
    errno = saved_errno;
    return -1;
}
174
/*
 * Duplicate @fd.
 *
 * Uses fcntl(F_DUPFD_CLOEXEC) where available; otherwise falls back to
 * dup() followed by qemu_set_cloexec() on the new descriptor.
 *
 * Returns the new descriptor, or -1 on failure.
 */
int qemu_dup(int fd)
{
#ifdef F_DUPFD_CLOEXEC
    return fcntl(fd, F_DUPFD_CLOEXEC, 0);
#else
    int copy = dup(fd);

    if (copy != -1) {
        qemu_set_cloexec(copy);
    }
    return copy;
#endif
}
188
/*
 * Parse the fd set id in @param by handing it to qemu_parse_fd().
 *
 * Returns whatever qemu_parse_fd() returns; the caller in this file
 * treats -1 as a parse failure.
 */
static int qemu_parse_fdset(const char *param)
{
    int fdset_id = qemu_parse_fd(param);

    return fdset_id;
}
193
194static void qemu_probe_lock_ops(void)195{
196if (fcntl_op_setlk == -1) {197#ifdef F_OFD_SETLK198int fd;199int ret;200struct flock fl = {201.l_whence = SEEK_SET,202.l_start = 0,203.l_len = 0,204.l_type = F_WRLCK,205};206
207fd = open("/dev/null", O_RDWR);208if (fd < 0) {209fprintf(stderr,210"Failed to open /dev/null for OFD lock probing: %s\n",211strerror(errno));212fcntl_op_setlk = F_SETLK;213fcntl_op_getlk = F_GETLK;214return;215}216ret = fcntl(fd, F_OFD_GETLK, &fl);217close(fd);218if (!ret) {219fcntl_op_setlk = F_OFD_SETLK;220fcntl_op_getlk = F_OFD_GETLK;221} else {222fcntl_op_setlk = F_SETLK;223fcntl_op_getlk = F_GETLK;224}225#else226fcntl_op_setlk = F_SETLK;227fcntl_op_getlk = F_GETLK;228#endif229}230}
231
/*
 * Report whether the OFD lock commands were selected by
 * qemu_probe_lock_ops().  Always false when F_OFD_SETLK is not defined
 * at build time.
 */
bool qemu_has_ofd_lock(void)
{
    bool has_ofd = false;

    qemu_probe_lock_ops();
#ifdef F_OFD_SETLK
    has_ofd = (fcntl_op_setlk == F_OFD_SETLK);
#endif

    return has_ofd;
}
241
242static int qemu_lock_fcntl(int fd, int64_t start, int64_t len, int fl_type)243{
244int ret;245struct flock fl = {246.l_whence = SEEK_SET,247.l_start = start,248.l_len = len,249.l_type = fl_type,250};251qemu_probe_lock_ops();252ret = RETRY_ON_EINTR(fcntl(fd, fcntl_op_setlk, &fl));253return ret == -1 ? -errno : 0;254}
255
/*
 * Lock the byte range [@start, @start + @len) of @fd: a write lock when
 * @exclusive is true, a read lock otherwise.
 *
 * Returns 0 on success, -errno on failure.
 */
int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive)
{
    int lock_type;

    if (exclusive) {
        lock_type = F_WRLCK;
    } else {
        lock_type = F_RDLCK;
    }

    return qemu_lock_fcntl(fd, start, len, lock_type);
}
260
/*
 * Release the lock on the byte range [@start, @start + @len) of @fd.
 *
 * Returns 0 on success, -errno on failure.
 */
int qemu_unlock_fd(int fd, int64_t start, int64_t len)
{
    const int lock_type = F_UNLCK;

    return qemu_lock_fcntl(fd, start, len, lock_type);
}
265
266int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)267{
268int ret;269struct flock fl = {270.l_whence = SEEK_SET,271.l_start = start,272.l_len = len,273.l_type = exclusive ? F_WRLCK : F_RDLCK,274};275qemu_probe_lock_ops();276ret = fcntl(fd, fcntl_op_getlk, &fl);277if (ret == -1) {278return -errno;279} else {280return fl.l_type == F_UNLCK ? 0 : -EAGAIN;281}282}
283#endif284
/*
 * Report whether O_DIRECT was defined when this file was compiled.
 */
bool qemu_has_direct_io(void)
{
#ifdef O_DIRECT
    const bool supported = true;
#else
    const bool supported = false;
#endif

    return supported;
}
293
/*
 * open() @name with @flags and @mode, arranging for the descriptor to
 * be close-on-exec: O_CLOEXEC is added to the flags where defined,
 * otherwise qemu_set_cloexec() is called on the opened descriptor.
 *
 * Returns the open() result.
 */
static int qemu_open_cloexec(const char *name, int flags, mode_t mode)
{
#ifdef O_CLOEXEC
    return open(name, flags | O_CLOEXEC, mode);
#else
    int fd = open(name, flags, mode);

    if (fd >= 0) {
        qemu_set_cloexec(fd);
    }
    return fd;
#endif
}
307
308/*
309* Opens a file with FD_CLOEXEC set
310*/
311static int312qemu_open_internal(const char *name, int flags, mode_t mode, Error **errp)313{
314int ret;315
316#ifndef _WIN32317const char *fdset_id_str;318
319/* Attempt dup of fd from fd set */320if (strstart(name, "/dev/fdset/", &fdset_id_str)) {321int64_t fdset_id;322
323fdset_id = qemu_parse_fdset(fdset_id_str);324if (fdset_id == -1) {325error_setg(errp, "Could not parse fdset %s", name);326errno = EINVAL;327return -1;328}329
330return monitor_fdset_dup_fd_add(fdset_id, flags, errp);331}332#endif333
334ret = qemu_open_cloexec(name, flags, mode);335
336if (ret == -1) {337const char *action = flags & O_CREAT ? "create" : "open";338#ifdef O_DIRECT339/* Give more helpful error message for O_DIRECT */340if (errno == EINVAL && (flags & O_DIRECT)) {341ret = open(name, flags & ~O_DIRECT, mode);342if (ret != -1) {343close(ret);344error_setg(errp, "Could not %s '%s': "345"filesystem does not support O_DIRECT",346action, name);347errno = EINVAL; /* restore first open()'s errno */348return -1;349}350}351#endif /* O_DIRECT */352error_setg_errno(errp, errno, "Could not %s '%s'",353action, name);354}355
356return ret;357}
358
359
360int qemu_open(const char *name, int flags, Error **errp)361{
362assert(!(flags & O_CREAT));363
364return qemu_open_internal(name, flags, 0, errp);365}
366
367
368int qemu_create(const char *name, int flags, mode_t mode, Error **errp)369{
370assert(!(flags & O_CREAT));371
372return qemu_open_internal(name, flags | O_CREAT, mode, errp);373}
374
375
/*
 * open()-style variant of qemu_open_internal() without an Error
 * object.
 *
 * When @flags contains O_CREAT, one extra int argument is read and
 * used as the file mode; otherwise the mode is 0.  If the open fails
 * with EINVAL while O_DIRECT was requested, a hint is emitted through
 * error_report().
 *
 * Returns the descriptor, or -1 with errno set.
 */
int qemu_open_old(const char *name, int flags, ...)
{
    mode_t mode = 0;
    int fd;

    if (flags & O_CREAT) {
        va_list ap;

        va_start(ap, flags);
        mode = va_arg(ap, int);
        va_end(ap);
    }

    fd = qemu_open_internal(name, flags, mode, NULL);

#ifdef O_DIRECT
    if (fd == -1 && errno == EINVAL && (flags & O_DIRECT)) {
        error_report("file system may not support O_DIRECT");
        errno = EINVAL; /* in case it was clobbered */
    }
#endif /* O_DIRECT */

    return fd;
}
399
/*
 * Close @fd.
 *
 * monitor_fdset_dup_fd_remove() is called first so that a descriptor
 * that was dup'd from an fd set is dropped from it.
 *
 * Returns the close() result.
 */
int qemu_close(int fd)
{
    int rc;

    /* Close fd that was dup'd from an fdset */
    monitor_fdset_dup_fd_remove(fd);
    rc = close(fd);

    return rc;
}
406
/*
 * Delete a file from the filesystem, unless the filename is
 * /dev/fdset/..., in which case nothing is removed and success is
 * reported.
 *
 * Returns: On success, zero is returned.  On error, -1 is returned,
 * and errno is set appropriately.
 */
int qemu_unlink(const char *name)
{
    bool is_fdset = g_str_has_prefix(name, "/dev/fdset/");

    return is_fdset ? 0 : unlink(name);
}
421
/*
 * A variant of write(2) which handles partial writes.
 *
 * write() is called repeatedly until all `count' bytes of `buf' have
 * been written or it fails with an error other than EINTR.
 *
 * Returns the number of bytes transferred.
 * errno is set (by the failing write()) if fewer than `count' bytes
 * are written.
 *
 * This function does not work with non-blocking fds.
 * Any of the possibilities with non-blocking fds is bad:
 *   - return a short write (then the name is wrong)
 *   - busy wait adding (errno == EAGAIN) to the loop
 */
ssize_t qemu_write_full(int fd, const void *buf, size_t count)
{
    /* byte cursor: arithmetic on a void pointer is not ISO C */
    const char *cursor = buf;
    ssize_t total = 0;

    while (count) {
        ssize_t ret = write(fd, cursor, count);

        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            }
            break;
        }

        count -= ret;
        cursor += ret;
        total += ret;
    }

    return total;
}
453
/*
 * Opens a socket with FD_CLOEXEC set.
 *
 * Where SOCK_CLOEXEC is defined it is tried first; only when that
 * attempt fails with EINVAL is the socket created without it and
 * passed to qemu_set_cloexec() afterwards.
 *
 * Returns the socket() result.
 */
int qemu_socket(int domain, int type, int protocol)
{
    int sock;

#ifdef SOCK_CLOEXEC
    sock = socket(domain, type | SOCK_CLOEXEC, protocol);
    if (!(sock == -1 && errno == EINVAL)) {
        /* success, or a failure not caused by the extra flag */
        return sock;
    }
#endif

    sock = socket(domain, type, protocol);
    if (sock >= 0) {
        qemu_set_cloexec(sock);
    }

    return sock;
}
474
/*
 * Accept a connection and set FD_CLOEXEC.
 *
 * With CONFIG_ACCEPT4, accept4(SOCK_CLOEXEC) is tried first; only when
 * it fails with ENOSYS does the code fall back to accept() followed by
 * qemu_set_cloexec() on the new descriptor.
 *
 * Returns the accept4()/accept() result.
 */
int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen)
{
    int conn;

#ifdef CONFIG_ACCEPT4
    conn = accept4(s, addr, addrlen, SOCK_CLOEXEC);
    if (!(conn == -1 && errno == ENOSYS)) {
        /* success, or a failure other than "accept4 not implemented" */
        return conn;
    }
#endif

    conn = accept(s, addr, addrlen);
    if (conn >= 0) {
        qemu_set_cloexec(conn);
    }

    return conn;
}
495
/*
 * A variant of send(2) (with flags 0) which handles partial sends.
 *
 * send() is called repeatedly on socket `s' until all `count' bytes of
 * `buf' have been sent or it fails with an error other than EINTR.
 *
 * Returns the number of bytes transferred.
 * errno is set (by the failing send()) if fewer than `count' bytes are
 * sent.
 */
ssize_t qemu_send_full(int s, const void *buf, size_t count)
{
    /* byte cursor: arithmetic on a void pointer is not ISO C */
    const char *cursor = buf;
    ssize_t total = 0;

    while (count) {
        ssize_t ret = send(s, cursor, count, 0);

        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            }
            break;
        }

        count -= ret;
        cursor += ret;
        total += ret;
    }

    return total;
}
517
518void qemu_set_hw_version(const char *version)519{
520hw_version = version;521}
522
523const char *qemu_hw_version(void)524{
525return hw_version;526}
527
#ifdef _WIN32
/*
 * Exit handler registered by socket_init(): calls WSACleanup().  The
 * result of WSACleanup() is not checked.
 */
static void socket_cleanup(void)
{
    (void)WSACleanup();
}
#endif
/*
 * One-time socket subsystem initialisation.
 *
 * On Windows this calls WSAStartup() requesting Winsock 2.2 and
 * registers socket_cleanup() with atexit().  On other hosts there is
 * nothing to do.
 *
 * Returns 0 on success, -1 if WSAStartup() fails (the error code is
 * printed to stderr).
 */
int socket_init(void)
{
#ifdef _WIN32
    WSADATA Data;
    int ret;

    ret = WSAStartup(MAKEWORD(2, 2), &Data);
    if (ret != 0) {
        /*
         * WSAStartup() returns the error code directly;
         * WSAGetLastError() must not be used when it fails.
         */
        fprintf(stderr, "WSAStartup: %d\n", ret);
        return -1;
    }
    atexit(socket_cleanup);
#endif
    return 0;
}
551
552
#ifndef CONFIG_IOVEC
/*
 * Fallback for hosts without readv()/writev(): process the @iov_cnt
 * elements of @iov one after another with read() or write() (selected
 * by @do_write), retrying on EINTR and continuing inside an element
 * until it has been transferred completely.
 *
 * Zero-length elements are skipped.  A zero-length read()/write()
 * returns 0, which must not be mistaken for end-of-file, or the
 * elements after it would never be processed.
 *
 * Returns the total number of bytes transferred.  Processing stops
 * early when read()/write() returns 0 or fails; -1 is returned only if
 * nothing was transferred before the failure.  A negative @iov_cnt
 * fails with EINVAL.
 */
static ssize_t
readv_writev(int fd, const struct iovec *iov, int iov_cnt, bool do_write)
{
    int i = 0;
    ssize_t ret = 0;
    size_t off = 0;

    if (iov_cnt < 0) {
        errno = EINVAL;
        return -1;
    }

    while (i < iov_cnt) {
        char *base = iov[i].iov_base;
        size_t left = iov[i].iov_len - off;
        ssize_t r;

        if (left == 0) {
            /* empty element: nothing to transfer, move on */
            off = 0;
            i++;
            continue;
        }

        r = do_write
            ? write(fd, base + off, left)
            : read(fd, base + off, left);
        if (r > 0) {
            ret += r;
            off += r;
            if (off < iov[i].iov_len) {
                /* short transfer: stay on this element */
                continue;
            }
        } else if (!r) {
            break;
        } else if (errno == EINTR) {
            continue;
        } else {
            /*
             * else it is some "other" error,
             * only return if there was no data processed.
             */
            if (ret == 0) {
                ret = -1;
            }
            break;
        }
        off = 0;
        i++;
    }
    return ret;
}

/* readv() replacement built on readv_writev(). */
ssize_t
readv(int fd, const struct iovec *iov, int iov_cnt)
{
    return readv_writev(fd, iov, iov_cnt, false);
}

/* writev() replacement built on readv_writev(). */
ssize_t
writev(int fd, const struct iovec *iov, int iov_cnt)
{
    return readv_writev(fd, iov, iov_cnt, true);
}
#endif
/*
 * Make sure data goes on disk, but if possible do not bother to
 * write out the inode just for timestamp updates.
 *
 * Unfortunately even in 2009 many operating systems do not support
 * fdatasync and have to fall back to fsync.
 *
 * Returns the fdatasync()/fsync() result.
 */
int qemu_fdatasync(int fd)
{
    int rc;

#ifdef CONFIG_FDATASYNC
    rc = fdatasync(fd);
#else
    rc = fsync(fd);
#endif

    return rc;
}
616