Diffstat (limited to 'arch/um')
121 files changed, 3187 insertions, 3497 deletions
diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 93a5a8999b07..79509c7f39de 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,13 +5,14 @@ menu "UML-specific options" config UML bool default y + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_HAS_CPU_FINALIZE_INIT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select ARCH_NO_PREEMPT + select ARCH_HAS_STRICT_KERNEL_RWX select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN @@ -31,7 +32,10 @@ config UML select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c select HAVE_ARCH_VMAP_STACK - select HAVE_RUST if X86_64 + select HAVE_RUST + select ARCH_HAS_UBSAN + select HAVE_ARCH_TRACEHOOK + select THREAD_INFO_IN_TASK config MMU bool @@ -48,12 +52,13 @@ config NO_IOMEM config UML_IOMEM_EMULATION bool select INDIRECT_IOMEM + select HAS_IOPORT select GENERIC_PCI_IOMAP select GENERIC_IOMAP select NO_GENERIC_PCI_IOPORT_MAP config NO_IOPORT_MAP - def_bool y + def_bool !UML_IOMEM_EMULATION config ISA bool @@ -93,7 +98,7 @@ config MAY_HAVE_RUNTIME_DEPS config STATIC_LINK bool "Force a static link" - depends on CC_CAN_LINK_STATIC_NO_RUNTIME_DEPS || !MAY_HAVE_RUNTIME_DEPS + depends on !MAY_HAVE_RUNTIME_DEPS help This option gives you the ability to force a static link of UML. Normally, UML is linked as a shared binary. This is inconvenient for @@ -208,8 +213,8 @@ config MMAPPER config PGTABLE_LEVELS int - default 3 if 3_LEVEL_PGTABLES - default 2 + default 4 if 64BIT + default 2 if !64BIT config UML_TIME_TRAVEL_SUPPORT bool @@ -226,6 +231,21 @@ config UML_TIME_TRAVEL_SUPPORT It is safe to say Y, but you probably don't need this. +config UML_MAX_USERSPACE_ITERATIONS + int + prompt "Maximum number of unscheduled userspace iterations" + default 10000 + depends on UML_TIME_TRAVEL_SUPPORT + help + In UML inf-cpu and ext time-travel mode userspace can run without being + interrupted. This will eventually overwhelm the kernel and create OOM + situations (mainly RCU not running). This setting specifies the number + of kernel/userspace switches (minor/major page fault, signal or syscall) + for the same userspace thread before the sched_clock is advanced by a + jiffie to trigger scheduling. + + Setting it to zero disables the feature. + config KASAN_SHADOW_OFFSET hex depends on KASAN diff --git a/arch/um/Makefile b/arch/um/Makefile index 34957dcb88b9..1d36a613aad8 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -20,7 +20,7 @@ endif ARCH_DIR := arch/um # We require bash because the vmlinux link and loader script cpp use bash # features. 
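[Editor's illustration, not part of the patch: a minimal userspace C sketch of the accounting described in the UML_MAX_USERSPACE_ITERATIONS help text above. All names here are hypothetical, not the kernel's actual implementation; it only models the idea of counting kernel/userspace switches and advancing sched_clock by one jiffy once the limit is hit.]

/*
 * Sketch of the UML_MAX_USERSPACE_ITERATIONS idea: every kernel/userspace
 * switch (page fault, signal, syscall) of the same thread bumps a counter;
 * once the configured limit is reached, the simulated sched_clock is advanced
 * by one jiffy so the scheduler (and RCU) gets a chance to run.
 */
#include <stdio.h>

#define MAX_USERSPACE_ITERATIONS 10000	/* 0 would disable the feature */
#define NSEC_PER_JIFFY 10000000ULL	/* one jiffy at HZ=100 */

static unsigned long long sched_clock_ns;
static unsigned int unscheduled_iterations;

/* Hypothetical hook, called on each switch for the current thread. */
static void account_unscheduled_iteration(void)
{
	if (MAX_USERSPACE_ITERATIONS == 0)
		return;					/* feature disabled */

	if (++unscheduled_iterations < MAX_USERSPACE_ITERATIONS)
		return;

	unscheduled_iterations = 0;
	sched_clock_ns += NSEC_PER_JIFFY;		/* force time forward */
	printf("clock advanced to %llu ns -> reschedule\n", sched_clock_ns);
}

int main(void)
{
	for (int i = 0; i < 25000; i++)
		account_unscheduled_iteration();
	return 0;
}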
-SHELL := /bin/bash +SHELL := bash MODE_INCLUDE += -I$(srctree)/$(ARCH_DIR)/include/shared/skas @@ -61,7 +61,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \ $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \ -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \ -Din6addr_loopback=kernel_in6addr_loopback \ - -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr + -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \ + -D__close_range=kernel__close_range KBUILD_RUSTFLAGS += -Crelocation-model=pie @@ -70,7 +71,9 @@ KBUILD_AFLAGS += $(ARCH_INCLUDE) USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \ $(ARCH_INCLUDE) $(MODE_INCLUDE) $(filter -I%,$(CFLAGS)) \ -D_FILE_OFFSET_BITS=64 -idirafter $(srctree)/include \ - -idirafter $(objtree)/include -D__KERNEL__ -D__UM_HOST__ + -idirafter $(objtree)/include -D__KERNEL__ -D__UM_HOST__ \ + -include $(srctree)/include/linux/compiler-version.h \ + -include $(srctree)/include/linux/kconfig.h #This will adjust *FLAGS accordingly to the platform. include $(srctree)/$(ARCH_DIR)/Makefile-os-Linux diff --git a/arch/um/Makefile-skas b/arch/um/Makefile-skas index 67323b028999..1a27e65bcb9c 100644 --- a/arch/um/Makefile-skas +++ b/arch/um/Makefile-skas @@ -3,15 +3,15 @@ # Licensed under the GPL # -GPROF_OPT += -pg +export UM_GPROF_OPT += -pg ifdef CONFIG_CC_IS_CLANG -GCOV_OPT += -fprofile-instr-generate -fcoverage-mapping +export UM_GCOV_OPT += -fprofile-instr-generate -fcoverage-mapping else -GCOV_OPT += -fprofile-arcs -ftest-coverage +export UM_GCOV_OPT += -fprofile-arcs -ftest-coverage endif -CFLAGS-$(CONFIG_GCOV) += $(GCOV_OPT) -CFLAGS-$(CONFIG_GPROF) += $(GPROF_OPT) -LINK-$(CONFIG_GCOV) += $(GCOV_OPT) -LINK-$(CONFIG_GPROF) += $(GPROF_OPT) +CFLAGS-$(CONFIG_GCOV) += $(UM_GCOV_OPT) +CFLAGS-$(CONFIG_GPROF) += $(UM_GPROF_OPT) +LINK-$(CONFIG_GCOV) += $(UM_GCOV_OPT) +LINK-$(CONFIG_GPROF) += $(UM_GPROF_OPT) diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index e543cbac8792..1ffa088739f4 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -1,4 +1,3 @@ -CONFIG_3_LEVEL_PGTABLES=y # CONFIG_COMPACTION is not set CONFIG_BINFMT_MISC=m CONFIG_HOSTFS=y @@ -61,7 +60,6 @@ CONFIG_UML_NET_DAEMON=y CONFIG_UML_NET_MCAST=y CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=m diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 939cb12318ca..03b10d3f6816 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -59,7 +59,6 @@ CONFIG_UML_NET_DAEMON=y CONFIG_UML_NET_MCAST=y CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y -CONFIG_REISERFS_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m CONFIG_ISO9660_FS=m diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index b94b2618e7d8..9cb196070614 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -297,26 +297,6 @@ config UML_NET_MCAST If unsure, say N. -config UML_NET_PCAP - bool "pcap transport (obsolete)" - depends on UML_NET - depends on !MODVERSIONS - select MAY_HAVE_RUNTIME_DEPS - help - The pcap transport makes a pcap packet stream on the host look - like an ethernet device inside UML. This is useful for making - UML act as a network monitor for the host. You must have libcap - installed in order to build the pcap transport into UML. 
- - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable this option. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - config UML_NET_SLIRP bool "SLiRP transport (obsolete)" depends on UML_NET @@ -365,16 +345,20 @@ config UML_RTC by providing a fake RTC clock that causes a wakeup at the right time. -config UML_PCI_OVER_VIRTIO - bool "Enable PCI over VIRTIO device simulation" - # in theory, just VIRTIO is enough, but that causes recursion - depends on VIRTIO_UML +config UML_PCI + bool select FORCE_PCI select UML_IOMEM_EMULATION select UML_DMA_EMULATION select PCI_MSI select PCI_LOCKLESS_CONFIG +config UML_PCI_OVER_VIRTIO + bool "Enable PCI over VIRTIO device simulation" + # in theory, just VIRTIO is enough, but that causes recursion + depends on VIRTIO_UML + select UML_PCI + config UML_PCI_OVER_VIRTIO_DEVICE_ID int "set the virtio device ID for PCI emulation" default -1 diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 0e6af81096fd..0a5820343ad3 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -20,14 +20,9 @@ harddog-objs := harddog_kern.o harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o rtc-objs := rtc_kern.o rtc_user.o -LDFLAGS_pcap.o = $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libpcap.a) - LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) -targets := pcap_kern.o pcap_user.o vde_kern.o vde_user.o - -$(obj)/pcap.o: $(obj)/pcap_kern.o $(obj)/pcap_user.o - $(LD) -r -dp -o $@ $^ $(ld_flags) +targets := vde_kern.o vde_user.o $(obj)/vde.o: $(obj)/vde_kern.o $(obj)/vde_user.o $(LD) -r -dp -o $@ $^ $(ld_flags) @@ -49,7 +44,6 @@ obj-$(CONFIG_UML_NET_DAEMON) += daemon.o obj-$(CONFIG_UML_NET_VECTOR) += vector.o obj-$(CONFIG_UML_NET_VDE) += vde.o obj-$(CONFIG_UML_NET_MCAST) += umcast.o -obj-$(CONFIG_UML_NET_PCAP) += pcap.o obj-$(CONFIG_UML_NET) += net.o obj-$(CONFIG_MCONSOLE) += mconsole.o obj-$(CONFIG_MMAPPER) += mmapper_kern.o @@ -66,10 +60,11 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-$(CONFIG_UML_RANDOM) += random.o obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o obj-$(CONFIG_UML_RTC) += rtc.o -obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virt-pci.o +obj-$(CONFIG_UML_PCI) += virt-pci.o +obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o # pcap_user.o must be added explicitly. 
-USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o +USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"' diff --git a/arch/um/drivers/chan.h b/arch/um/drivers/chan.h index e14b9cdf7a33..5a61db512ffb 100644 --- a/arch/um/drivers/chan.h +++ b/arch/um/drivers/chan.h @@ -22,7 +22,8 @@ struct chan { unsigned int output:1; unsigned int opened:1; unsigned int enabled:1; - int fd; + int fd_in; + int fd_out; /* only different to fd_in if blocking output is needed */ const struct chan_ops *ops; void *data; }; diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index 37538b4168da..e78a99816c86 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -81,6 +81,12 @@ static const struct chan_ops not_configged_ops = { }; #endif /* CONFIG_NOCONFIG_CHAN */ +static inline bool need_output_blocking(void) +{ + return time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL; +} + static int open_one_chan(struct chan *chan) { int fd, err; @@ -96,15 +102,43 @@ static int open_one_chan(struct chan *chan) return fd; err = os_set_fd_block(fd, 0); - if (err) { - (*chan->ops->close)(fd, chan->data); - return err; - } + if (err) + goto out_close; + + chan->fd_in = fd; + chan->fd_out = fd; + + /* + * In time-travel modes infinite-CPU and external we need to guarantee + * that any writes to the output succeed immediately from the point of + * the VM. The best way to do this is to put the FD in blocking mode + * and simply wait/retry until everything is written. + * As every write is guaranteed to complete, we also do not need to + * request an IRQ for the output. + * + * Note that input cannot happen in a time synchronized way. We permit + * it, but time passes very quickly if anything waits for a read.
+ */ + if (chan->output && need_output_blocking()) { + err = os_dup_file(chan->fd_out); + if (err < 0) + goto out_close; - chan->fd = fd; + chan->fd_out = err; + + err = os_set_fd_block(chan->fd_out, 1); + if (err) { + os_close_file(chan->fd_out); + goto out_close; + } + } chan->opened = 1; return 0; + +out_close: + (*chan->ops->close)(fd, chan->data); + return err; } static int open_chan(struct list_head *chans) @@ -125,7 +159,7 @@ static int open_chan(struct list_head *chans) void chan_enable_winch(struct chan *chan, struct tty_port *port) { if (chan && chan->primary && chan->ops->winch) - register_winch(chan->fd, port); + register_winch(chan->fd_in, port); } static void line_timer_cb(struct work_struct *work) @@ -156,8 +190,9 @@ int enable_chan(struct line *line) if (chan->enabled) continue; - err = line_setup_irq(chan->fd, chan->input, chan->output, line, - chan); + err = line_setup_irq(chan->fd_in, chan->input, + chan->output && !need_output_blocking(), + line, chan); if (err) goto out_close; @@ -196,7 +231,8 @@ void free_irqs(void) if (chan->input && chan->enabled) um_free_irq(chan->line->read_irq, chan); - if (chan->output && chan->enabled) + if (chan->output && chan->enabled && + !need_output_blocking()) um_free_irq(chan->line->write_irq, chan); chan->enabled = 0; } @@ -216,15 +252,19 @@ static void close_one_chan(struct chan *chan, int delay_free_irq) } else { if (chan->input && chan->enabled) um_free_irq(chan->line->read_irq, chan); - if (chan->output && chan->enabled) + if (chan->output && chan->enabled && + !need_output_blocking()) um_free_irq(chan->line->write_irq, chan); chan->enabled = 0; } + if (chan->fd_out != chan->fd_in) + os_close_file(chan->fd_out); if (chan->ops->close != NULL) - (*chan->ops->close)(chan->fd, chan->data); + (*chan->ops->close)(chan->fd_in, chan->data); chan->opened = 0; - chan->fd = -1; + chan->fd_in = -1; + chan->fd_out = -1; } void close_chan(struct line *line) @@ -244,7 +284,7 @@ void close_chan(struct line *line) void deactivate_chan(struct chan *chan, int irq) { if (chan && chan->enabled) - deactivate_fd(chan->fd, irq); + deactivate_fd(chan->fd_in, irq); } int write_chan(struct chan *chan, const u8 *buf, size_t len, int write_irq) @@ -254,7 +294,7 @@ int write_chan(struct chan *chan, const u8 *buf, size_t len, int write_irq) if (len == 0 || !chan || !chan->ops->write) return 0; - n = chan->ops->write(chan->fd, buf, len, chan->data); + n = chan->ops->write(chan->fd_out, buf, len, chan->data); if (chan->primary) { ret = n; } @@ -268,7 +308,7 @@ int console_write_chan(struct chan *chan, const char *buf, int len) if (!chan || !chan->ops->console_write) return 0; - n = chan->ops->console_write(chan->fd, buf, len); + n = chan->ops->console_write(chan->fd_out, buf, len); if (chan->primary) ret = n; return ret; @@ -296,14 +336,14 @@ int chan_window_size(struct line *line, unsigned short *rows_out, if (chan && chan->primary) { if (chan->ops->window_size == NULL) return 0; - return chan->ops->window_size(chan->fd, chan->data, + return chan->ops->window_size(chan->fd_in, chan->data, rows_out, cols_out); } chan = line->chan_out; if (chan && chan->primary) { if (chan->ops->window_size == NULL) return 0; - return chan->ops->window_size(chan->fd, chan->data, + return chan->ops->window_size(chan->fd_in, chan->data, rows_out, cols_out); } return 0; @@ -319,7 +359,7 @@ static void free_one_chan(struct chan *chan) (*chan->ops->free)(chan->data); if (chan->primary && chan->output) - ignore_sigio_fd(chan->fd); + ignore_sigio_fd(chan->fd_in); kfree(chan); } @@ -478,7 
+518,8 @@ static struct chan *parse_chan(struct line *line, char *str, int device, .output = 0, .opened = 0, .enabled = 0, - .fd = -1, + .fd_in = -1, + .fd_out = -1, .ops = ops, .data = data }); return chan; @@ -549,7 +590,7 @@ void chan_interrupt(struct line *line, int irq) schedule_delayed_work(&line->task, 1); goto out; } - err = chan->ops->read(chan->fd, &c, chan->data); + err = chan->ops->read(chan->fd_in, &c, chan->data); if (err > 0) tty_insert_flip_char(port, c, TTY_NORMAL); } while (err > 0); diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index ec04e47b9d79..35f9beeb19b3 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -23,7 +23,7 @@ int generic_read(int fd, __u8 *c_out, void *unused) { int n; - n = read(fd, c_out, sizeof(*c_out)); + CATCH_EINTR(n = read(fd, c_out, sizeof(*c_out))); if (n > 0) return n; else if (n == 0) @@ -37,11 +37,23 @@ int generic_read(int fd, __u8 *c_out, void *unused) int generic_write(int fd, const __u8 *buf, size_t n, void *unused) { + int written = 0; int err; - err = write(fd, buf, n); - if (err > 0) - return err; + /* The FD may be in blocking mode, as such, need to retry short writes, + * they may have been interrupted by a signal. + */ + do { + errno = 0; + err = write(fd, buf + written, n - written); + if (err > 0) { + written += err; + continue; + } + } while (err < 0 && errno == EINTR); + + if (written > 0) + return written; else if (errno == EAGAIN) return 0; else if (err == 0) @@ -149,6 +161,8 @@ static __noreturn int winch_thread(void *arg) int count; char c = 1; + os_set_pdeathsig(); + pty_fd = data->pty_fd; pipe_fd = data->pipe_fd; count = write(pipe_fd, &c, sizeof(c)); diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 60d1c6cab8a9..819aabb4ecdc 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -49,6 +49,7 @@ #include "mconsole.h" #include "harddog.h" +MODULE_DESCRIPTION("UML hardware watchdog"); MODULE_LICENSE("GPL"); static DEFINE_MUTEX(harddog_mutex); @@ -163,7 +164,6 @@ static const struct file_operations harddog_fops = { .compat_ioctl = compat_ptr_ioctl, .open = harddog_open, .release = harddog_release, - .llseek = no_llseek, }; static struct miscdevice harddog_miscdev = { diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index c42b793bce65..0ac149de1ac0 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -48,6 +48,7 @@ MODULE_PARM_DESC(mixer, MIXER_HELP); #ifndef MODULE static int set_dsp(char *name, int *add) { + *add = 0; dsp = name; return 0; } @@ -56,6 +57,7 @@ __uml_setup("dsp=", set_dsp, "dsp=<dsp device>\n" DSP_HELP); static int set_mixer(char *name, int *add) { + *add = 0; mixer = name; return 0; } @@ -291,7 +293,6 @@ static int hostmixer_release(struct inode *inode, struct file *file) static const struct file_operations hostaudio_fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = hostaudio_read, .write = hostaudio_write, .poll = hostaudio_poll, @@ -304,7 +305,6 @@ static const struct file_operations hostaudio_fops = { static const struct file_operations hostmixer_fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .unlocked_ioctl = hostmixer_ioctl_mixdev, .open = hostmixer_open_mixdev, .release = hostmixer_release, diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index ffc5cb92fa36..43d8959cc746 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -383,6 +383,7 @@ int setup_one_line(struct line *lines, int 
n, char *init, parse_chan_pair(NULL, line, n, opts, error_out); err = 0; } + *error_out = "configured as 'none'"; } else { char *new = kstrdup(init, GFP_KERNEL); if (!new) { @@ -406,6 +407,7 @@ int setup_one_line(struct line *lines, int n, char *init, } } if (err) { + *error_out = "failed to parse channel pair"; line->init_str = NULL; line->valid = 0; kfree(new); @@ -676,24 +678,26 @@ void register_winch_irq(int fd, int tty_fd, int pid, struct tty_port *port, goto cleanup; } - *winch = ((struct winch) { .list = LIST_HEAD_INIT(winch->list), - .fd = fd, + *winch = ((struct winch) { .fd = fd, .tty_fd = tty_fd, .pid = pid, .port = port, .stack = stack }); + spin_lock(&winch_handler_lock); + list_add(&winch->list, &winch_handlers); + spin_unlock(&winch_handler_lock); + if (um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt, IRQF_SHARED, "winch", winch) < 0) { printk(KERN_ERR "register_winch_irq - failed to register " "IRQ\n"); + spin_lock(&winch_handler_lock); + list_del(&winch->list); + spin_unlock(&winch_handler_lock); goto out_free; } - spin_lock(&winch_handler_lock); - list_add(&winch->list, &winch_handlers); - spin_unlock(&winch_handler_lock); - return; out_free: diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c index e24298a734be..a04cd13c6315 100644 --- a/arch/um/drivers/mconsole_user.c +++ b/arch/um/drivers/mconsole_user.c @@ -71,7 +71,9 @@ static struct mconsole_command *mconsole_parse(struct mc_request *req) return NULL; } +#ifndef MIN #define MIN(a,b) ((a)<(b) ? (a):(b)) +#endif #define STRINGX(x) #x #define STRING(x) STRINGX(x) diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 77c4afb8ab90..d5a9c5aabaec 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -336,7 +336,7 @@ static struct platform_driver uml_net_driver = { static void net_device_release(struct device *dev) { - struct uml_net *device = dev_get_drvdata(dev); + struct uml_net *device = container_of(dev, struct uml_net, pdev.dev); struct net_device *netdev = device->dev; struct uml_net_private *lp = netdev_priv(netdev); @@ -636,10 +636,7 @@ static int __init eth_setup(char *str) return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->index = n; diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c deleted file mode 100644 index 25ee2c97ca21..000000000000 --- a/arch/um/drivers/pcap_kern.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include "pcap_user.h" - -struct pcap_init { - char *host_if; - int promisc; - int optimize; - char *filter; -}; - -void pcap_init_kern(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct pcap_data *ppri; - struct pcap_init *init = data; - - pri = netdev_priv(dev); - ppri = (struct pcap_data *) pri->user; - ppri->host_if = init->host_if; - ppri->promisc = init->promisc; - ppri->optimize = init->optimize; - ppri->filter = init->filter; - - printk("pcap backend, host interface %s\n", ppri->host_if); -} - -static int pcap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return pcap_user_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER, - (struct pcap_data *) 
&lp->user); -} - -static int pcap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return -EPERM; -} - -static const struct net_kern_info pcap_kern_info = { - .init = pcap_init_kern, - .protocol = eth_protocol, - .read = pcap_read, - .write = pcap_write, -}; - -int pcap_setup(char *str, char **mac_out, void *data) -{ - struct pcap_init *init = data; - char *remain, *host_if = NULL, *options[2] = { NULL, NULL }; - int i; - - *init = ((struct pcap_init) - { .host_if = "eth0", - .promisc = 1, - .optimize = 0, - .filter = NULL }); - - remain = split_if_spec(str, &host_if, &init->filter, - &options[0], &options[1], mac_out, NULL); - if (remain != NULL) { - printk(KERN_ERR "pcap_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (host_if != NULL) - init->host_if = host_if; - - for (i = 0; i < ARRAY_SIZE(options); i++) { - if (options[i] == NULL) - continue; - if (!strcmp(options[i], "promisc")) - init->promisc = 1; - else if (!strcmp(options[i], "nopromisc")) - init->promisc = 0; - else if (!strcmp(options[i], "optimize")) - init->optimize = 1; - else if (!strcmp(options[i], "nooptimize")) - init->optimize = 0; - else { - printk(KERN_ERR "pcap_setup : bad option - '%s'\n", - options[i]); - return 0; - } - } - - return 1; -} - -static struct transport pcap_transport = { - .list = LIST_HEAD_INIT(pcap_transport.list), - .name = "pcap", - .setup = pcap_setup, - .user = &pcap_user_info, - .kern = &pcap_kern_info, - .private_size = sizeof(struct pcap_data), - .setup_size = sizeof(struct pcap_init), -}; - -static int register_pcap(void) -{ - register_transport(&pcap_transport); - return 0; -} - -late_initcall(register_pcap); diff --git a/arch/um/drivers/pcap_user.c b/arch/um/drivers/pcap_user.c deleted file mode 100644 index 52ddda3e3b10..000000000000 --- a/arch/um/drivers/pcap_user.c +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <errno.h> -#include <pcap.h> -#include <string.h> -#include <asm/types.h> -#include <net_user.h> -#include "pcap_user.h" -#include <um_malloc.h> - -#define PCAP_FD(p) (*(int *)(p)) - -static int pcap_user_init(void *data, void *dev) -{ - struct pcap_data *pri = data; - pcap_t *p; - char errors[PCAP_ERRBUF_SIZE]; - - p = pcap_open_live(pri->host_if, ETH_MAX_PACKET + ETH_HEADER_OTHER, - pri->promisc, 0, errors); - if (p == NULL) { - printk(UM_KERN_ERR "pcap_user_init : pcap_open_live failed - " - "'%s'\n", errors); - return -EINVAL; - } - - pri->dev = dev; - pri->pcap = p; - return 0; -} - -static int pcap_user_open(void *data) -{ - struct pcap_data *pri = data; - __u32 netmask; - int err; - - if (pri->pcap == NULL) - return -ENODEV; - - if (pri->filter != NULL) { - err = dev_netmask(pri->dev, &netmask); - if (err < 0) { - printk(UM_KERN_ERR "pcap_user_open : dev_netmask failed\n"); - return -EIO; - } - - pri->compiled = uml_kmalloc(sizeof(struct bpf_program), - UM_GFP_KERNEL); - if (pri->compiled == NULL) { - printk(UM_KERN_ERR "pcap_user_open : kmalloc failed\n"); - return -ENOMEM; - } - - err = pcap_compile(pri->pcap, - (struct bpf_program *) pri->compiled, - pri->filter, pri->optimize, netmask); - if (err < 0) { - printk(UM_KERN_ERR "pcap_user_open : pcap_compile failed - " - "'%s'\n", pcap_geterr(pri->pcap)); - goto out; - } - - err = pcap_setfilter(pri->pcap, pri->compiled); - if (err < 0) { - printk(UM_KERN_ERR "pcap_user_open : pcap_setfilter " - "failed - '%s'\n", pcap_geterr(pri->pcap)); - goto out; 
- } - } - - return PCAP_FD(pri->pcap); - - out: - kfree(pri->compiled); - return -EIO; -} - -static void pcap_remove(void *data) -{ - struct pcap_data *pri = data; - - if (pri->compiled != NULL) - pcap_freecode(pri->compiled); - - if (pri->pcap != NULL) - pcap_close(pri->pcap); -} - -struct pcap_handler_data { - char *buffer; - int len; -}; - -static void handler(u_char *data, const struct pcap_pkthdr *header, - const u_char *packet) -{ - int len; - - struct pcap_handler_data *hdata = (struct pcap_handler_data *) data; - - len = hdata->len < header->caplen ? hdata->len : header->caplen; - memcpy(hdata->buffer, packet, len); - hdata->len = len; -} - -int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri) -{ - struct pcap_handler_data hdata = ((struct pcap_handler_data) - { .buffer = buffer, - .len = len }); - int n; - - n = pcap_dispatch(pri->pcap, 1, handler, (u_char *) &hdata); - if (n < 0) { - printk(UM_KERN_ERR "pcap_dispatch failed - %s\n", - pcap_geterr(pri->pcap)); - return -EIO; - } - else if (n == 0) - return 0; - return hdata.len; -} - -const struct net_user_info pcap_user_info = { - .init = pcap_user_init, - .open = pcap_user_open, - .close = NULL, - .remove = pcap_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/pcap_user.h b/arch/um/drivers/pcap_user.h deleted file mode 100644 index 216246f5f09b..000000000000 --- a/arch/um/drivers/pcap_user.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - */ - -#include <net_user.h> - -struct pcap_data { - char *host_if; - int promisc; - int optimize; - char *filter; - void *compiled; - void *pcap; - void *dev; -}; - -extern const struct net_user_info pcap_user_info; - -extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri); - diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index c52b3ff3c092..a4508470df78 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -45,15 +45,17 @@ struct connection { static irqreturn_t pipe_interrupt(int irq, void *data) { struct connection *conn = data; - int fd; + int n_fds = 1, fd = -1; + ssize_t ret; - fd = os_rcv_fd(conn->socket[0], &conn->helper_pid); - if (fd < 0) { - if (fd == -EAGAIN) + ret = os_rcv_fd_msg(conn->socket[0], &fd, n_fds, &conn->helper_pid, + sizeof(conn->helper_pid)); + if (ret != sizeof(conn->helper_pid)) { + if (ret == -EAGAIN) return IRQ_NONE; - printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", - -fd); + printk(KERN_ERR "pipe_interrupt : os_rcv_fd_msg returned %zd\n", + ret); os_close_file(conn->fd); } diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index da985e0dc69a..ca08c91f47a3 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -79,7 +79,7 @@ static int __init rng_init (void) if (err < 0) goto err_out_cleanup_hw; - sigio_broken(random_fd); + sigio_broken(); hwrng.name = RNG_MODULE_NAME; hwrng.read = rng_dev_read; diff --git a/arch/um/drivers/rtc_kern.c b/arch/um/drivers/rtc_kern.c index 97ceb205cfe6..9158c936c128 100644 --- a/arch/um/drivers/rtc_kern.c +++ b/arch/um/drivers/rtc_kern.c @@ -51,6 +51,7 @@ static int uml_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) static int uml_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) { + struct timespec64 ts; unsigned long long secs; if (!enable && !uml_rtc_alarm_enabled) @@ -58,7 +59,8 @@ 
static int uml_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) uml_rtc_alarm_enabled = enable; - secs = uml_rtc_alarm_time - ktime_get_real_seconds(); + read_persistent_clock64(&ts); + secs = uml_rtc_alarm_time - ts.tv_sec; if (time_travel_mode == TT_MODE_OFF) { if (!enable) { @@ -73,7 +75,8 @@ static int uml_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) if (enable) time_travel_add_event_rel(&uml_rtc_alarm_event, - secs * NSEC_PER_SEC); + secs * NSEC_PER_SEC - + ts.tv_nsec); } return 0; @@ -168,11 +171,10 @@ cleanup: return err; } -static int uml_rtc_remove(struct platform_device *pdev) +static void uml_rtc_remove(struct platform_device *pdev) { device_init_wakeup(&pdev->dev, 0); uml_rtc_cleanup(); - return 0; } static struct platform_driver uml_rtc_driver = { diff --git a/arch/um/drivers/rtc_user.c b/arch/um/drivers/rtc_user.c index 7c3cec4c68cf..51e79f3148cd 100644 --- a/arch/um/drivers/rtc_user.c +++ b/arch/um/drivers/rtc_user.c @@ -39,7 +39,7 @@ int uml_rtc_start(bool timetravel) } /* apparently timerfd won't send SIGIO, use workaround */ - sigio_broken(uml_rtc_irq_fds[0]); + sigio_broken(); err = add_sigio_fd(uml_rtc_irq_fds[0]); if (err < 0) { close(uml_rtc_irq_fds[0]); diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c index 8f633e2e5f3d..97228aa080cb 100644 --- a/arch/um/drivers/slirp_user.c +++ b/arch/um/drivers/slirp_user.c @@ -49,7 +49,7 @@ static int slirp_tramp(char **argv, int fd) static int slirp_open(void *data) { struct slirp_data *pri = data; - int fds[2], pid, err; + int fds[2], err; err = os_pipe(fds, 1, 1); if (err) @@ -60,7 +60,6 @@ static int slirp_open(void *data) printk(UM_KERN_ERR "slirp_tramp failed - errno = %d\n", -err); goto out; } - pid = err; pri->slave = fds[1]; pri->slip.pos = 0; diff --git a/arch/um/drivers/ubd.h b/arch/um/drivers/ubd.h index f016fe15499f..2985c14661f4 100644 --- a/arch/um/drivers/ubd.h +++ b/arch/um/drivers/ubd.h @@ -7,8 +7,10 @@ #ifndef __UM_UBD_USER_H #define __UM_UBD_USER_H -extern int start_io_thread(unsigned long sp, int *fds_out); -extern int io_thread(void *arg); +#include <os.h> + +int start_io_thread(struct os_helper_thread **td_out, int *fd_out); +void *io_thread(void *arg); extern int kernel_fd; extern int ubd_read_poll(int timeout); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 63fc062add70..4de6613e7468 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -36,7 +36,6 @@ #include <linux/vmalloc.h> #include <linux/platform_device.h> #include <linux/scatterlist.h> -#include <asm/tlbflush.h> #include <kern_util.h> #include "mconsole_kern.h" #include <init.h> @@ -106,7 +105,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) #define DRIVER_NAME "uml-blkdev" static DEFINE_MUTEX(ubd_lock); -static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); @@ -447,53 +445,41 @@ static int bulk_req_safe_read( return n; } -/* Called without dev->lock held, and only in interrupt context.
*/ -static void ubd_handler(void) +static void ubd_end_request(struct io_thread_req *io_req) { - int n; - int count; - - while(1){ - n = bulk_req_safe_read( - thread_fd, - irq_req_buffer, - &irq_remainder, - &irq_remainder_size, - UBD_REQ_BUFFER_SIZE - ); - if (n < 0) { - if(n == -EAGAIN) - break; - printk(KERN_ERR "spurious interrupt in ubd_handler, " - "err = %d\n", -n); - return; - } - for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { - struct io_thread_req *io_req = (*irq_req_buffer)[count]; - - if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) { - blk_queue_max_discard_sectors(io_req->req->q, 0); - blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); - } - blk_mq_end_request(io_req->req, io_req->error); - kfree(io_req); - } + if (io_req->error == BLK_STS_NOTSUPP) { + if (req_op(io_req->req) == REQ_OP_DISCARD) + blk_queue_disable_discard(io_req->req->q); + else if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES) + blk_queue_disable_write_zeroes(io_req->req->q); } + blk_mq_end_request(io_req->req, io_req->error); + kfree(io_req); } static irqreturn_t ubd_intr(int irq, void *dev) { - ubd_handler(); + int len, i; + + while ((len = bulk_req_safe_read(thread_fd, irq_req_buffer, + &irq_remainder, &irq_remainder_size, + UBD_REQ_BUFFER_SIZE)) >= 0) { + for (i = 0; i < len / sizeof(struct io_thread_req *); i++) + ubd_end_request((*irq_req_buffer)[i]); + } + + if (len < 0 && len != -EAGAIN) + pr_err("spurious interrupt in %s, err = %d\n", __func__, len); return IRQ_HANDLED; } /* Only changed by ubd_init, which is an initcall. */ -static int io_pid = -1; +static struct os_helper_thread *io_td; static void kill_io_thread(void) { - if(io_pid != -1) - os_kill_process(io_pid, 1); + if (io_td) + os_kill_helper_thread(io_td); } __uml_exitcall(kill_io_thread); @@ -771,7 +757,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); goto error; } - flush_tlb_kernel_vm(); err = read_cow_bitmap(ubd_dev->fd, ubd_dev->cow.bitmap, ubd_dev->cow.bitmap_offset, @@ -794,7 +779,7 @@ static int ubd_open_dev(struct ubd *ubd_dev) static void ubd_device_release(struct device *dev) { - struct ubd *ubd_dev = dev_get_drvdata(dev); + struct ubd *ubd_dev = container_of(dev, struct ubd, pdev.dev); blk_mq_free_tag_set(&ubd_dev->tag_set); *ubd_dev = ((struct ubd) DEFAULT_UBD); @@ -847,6 +832,7 @@ static int ubd_add(int n, char **error_out) struct queue_limits lim = { .max_segments = MAX_SG, .seg_boundary_mask = PAGE_SIZE - 1, + .features = BLK_FEAT_WRITE_CACHE, }; struct gendisk *disk; int err = 0; @@ -879,7 +865,6 @@ static int ubd_add(int n, char **error_out) ubd_dev->tag_set.ops = &ubd_mq_ops; ubd_dev->tag_set.queue_depth = 64; ubd_dev->tag_set.numa_node = NUMA_NO_NODE; - ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ubd_dev->tag_set.driver_data = ubd_dev; ubd_dev->tag_set.nr_hw_queues = 1; @@ -893,8 +878,6 @@ static int ubd_add(int n, char **error_out) goto out_cleanup_tags; } - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_write_cache(disk->queue, true, false); disk->major = UBD_MAJOR; disk->first_minor = n << UBD_SHIFT; disk->minors = 1 << UBD_SHIFT; @@ -914,6 +897,8 @@ static int ubd_add(int n, char **error_out) if (err) goto out_cleanup_disk; + ubd_dev->disk = disk; + return 0; out_cleanup_disk: @@ -1092,7 +1077,7 @@ static int __init ubd_init(void) if (irq_req_buffer == NULL) { printk(KERN_ERR "Failed to initialize ubd buffering\n"); - return -1; + return -ENOMEM; } io_req_buffer = 
kmalloc_array(UBD_REQ_BUFFER_SIZE, sizeof(struct io_thread_req *), @@ -1103,7 +1088,7 @@ static int __init ubd_init(void) if (io_req_buffer == NULL) { printk(KERN_ERR "Failed to initialize ubd buffering\n"); - return -1; + return -ENOMEM; } platform_driver_register(&ubd_driver); mutex_lock(&ubd_lock); @@ -1119,8 +1104,8 @@ static int __init ubd_init(void) late_initcall(ubd_init); -static int __init ubd_driver_init(void){ - unsigned long stack; +static int __init ubd_driver_init(void) +{ int err; /* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/ @@ -1129,13 +1114,11 @@ static int __init ubd_driver_init(void){ /* Letting ubd=sync be like using ubd#s= instead of ubd#= is * enough. So use anyway the io thread. */ } - stack = alloc_stack(0, 0); - io_pid = start_io_thread(stack + PAGE_SIZE, &thread_fd); - if(io_pid < 0){ + err = start_io_thread(&io_td, &thread_fd); + if (err < 0) { printk(KERN_ERR "ubd : Failed to start I/O thread (errno = %d) - " - "falling back to synchronous I/O\n", -io_pid); - io_pid = -1; + "falling back to synchronous I/O\n", -err); return 0; } err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, @@ -1511,11 +1494,11 @@ int kernel_fd = -1; /* Only changed by the io thread. XXX: currently unused. */ static int io_count; -int io_thread(void *arg) +void *io_thread(void *arg) { int n, count, written, res; - os_fix_helper_signals(); + os_fix_helper_thread_signals(); while(1){ n = bulk_req_safe_read( @@ -1557,5 +1540,5 @@ int io_thread(void *arg) } while (written < n); } - return 0; + return NULL; } diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index a1afe414ce48..c5e6545f6fcf 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -23,11 +23,11 @@ #include <os.h> #include <poll.h> -struct pollfd kernel_pollfd; +static struct pollfd kernel_pollfd; -int start_io_thread(unsigned long sp, int *fd_out) +int start_io_thread(struct os_helper_thread **td_out, int *fd_out) { - int pid, fds[2], err; + int fds[2], err; err = os_pipe(fds, 1, 1); if(err < 0){ @@ -47,14 +47,14 @@ int start_io_thread(unsigned long sp, int *fd_out) goto out_close; } - pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM, NULL); - if(pid < 0){ - err = -errno; - printk("start_io_thread - clone failed : errno = %d\n", errno); + err = os_run_helper_thread(td_out, io_thread, NULL); + if (err < 0) { + printk("%s - failed to run helper thread, err = %d\n", + __func__, -err); goto out_close; } - return(pid); + return 0; out_close: os_close_file(fds[0]); diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index dc2feae789cb..b97bb52dd562 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -22,6 +22,7 @@ #include <linux/interrupt.h> #include <linux/firmware.h> #include <linux/fs.h> +#include <asm/atomic.h> #include <uapi/linux/filter.h> #include <init.h> #include <irq_kern.h> @@ -102,18 +103,33 @@ static const struct { static void vector_reset_stats(struct vector_private *vp) { + /* We reuse the existing queue locks for stats */ + + /* RX stats are modified with RX head_lock held + * in vector_poll. + */ + + spin_lock(&vp->rx_queue->head_lock); vp->estats.rx_queue_max = 0; vp->estats.rx_queue_running_average = 0; - vp->estats.tx_queue_max = 0; - vp->estats.tx_queue_running_average = 0; vp->estats.rx_encaps_errors = 0; + vp->estats.sg_ok = 0; + vp->estats.sg_linearized = 0; + spin_unlock(&vp->rx_queue->head_lock); + + /* TX stats are modified with TX head_lock held + * in vector_send. 
+ */ + + spin_lock(&vp->tx_queue->head_lock); vp->estats.tx_timeout_count = 0; vp->estats.tx_restart_queue = 0; vp->estats.tx_kicks = 0; vp->estats.tx_flow_control_xon = 0; vp->estats.tx_flow_control_xoff = 0; - vp->estats.sg_ok = 0; - vp->estats.sg_linearized = 0; + vp->estats.tx_queue_max = 0; + vp->estats.tx_queue_running_average = 0; + spin_unlock(&vp->tx_queue->head_lock); } static int get_mtu(struct arglist *def) @@ -141,7 +157,7 @@ static bool get_bpf_flash(struct arglist *def) if (allow != NULL) { if (kstrtoul(allow, 10, &result) == 0) - return (allow > 0); + return result > 0; } return false; } @@ -232,12 +248,6 @@ static int get_transport_options(struct arglist *def) static char *drop_buffer; -/* Array backed queues optimized for bulk enqueue/dequeue and - * 1:N (small values of N) or 1:1 enqueuer/dequeuer ratios. - * For more details and full design rationale see - * http://foswiki.cambridgegreys.com/Main/EatYourTailAndEnjoyIt - */ - /* * Advance the mmsg queue head by n = advance. Resets the queue to @@ -247,27 +257,13 @@ static char *drop_buffer; static int vector_advancehead(struct vector_queue *qi, int advance) { - int queue_depth; - qi->head = (qi->head + advance) % qi->max_depth; - spin_lock(&qi->tail_lock); - qi->queue_depth -= advance; - - /* we are at 0, use this to - * reset head and tail so we can use max size vectors - */ - - if (qi->queue_depth == 0) { - qi->head = 0; - qi->tail = 0; - } - queue_depth = qi->queue_depth; - spin_unlock(&qi->tail_lock); - return queue_depth; + atomic_sub(advance, &qi->queue_depth); + return atomic_read(&qi->queue_depth); } /* Advance the queue tail by n = advance. @@ -277,16 +273,11 @@ static int vector_advancehead(struct vector_queue *qi, int advance) static int vector_advancetail(struct vector_queue *qi, int advance) { - int queue_depth; - qi->tail = (qi->tail + advance) % qi->max_depth; - spin_lock(&qi->head_lock); - qi->queue_depth += advance; - queue_depth = qi->queue_depth; - spin_unlock(&qi->head_lock); - return queue_depth; + atomic_add(advance, &qi->queue_depth); + return atomic_read(&qi->queue_depth); } static int prep_msg(struct vector_private *vp, @@ -339,9 +330,7 @@ static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb) int iov_count; spin_lock(&qi->tail_lock); - spin_lock(&qi->head_lock); - queue_depth = qi->queue_depth; - spin_unlock(&qi->head_lock); + queue_depth = atomic_read(&qi->queue_depth); if (skb) packet_len = skb->len; @@ -360,6 +349,7 @@ static int vector_enqueue(struct vector_queue *qi, struct sk_buff *skb) mmsg_vector->msg_hdr.msg_iovlen = iov_count; mmsg_vector->msg_hdr.msg_name = vp->fds->remote_addr; mmsg_vector->msg_hdr.msg_namelen = vp->fds->remote_addr_size; + wmb(); /* Make the packet visible to the NAPI poll thread */ queue_depth = vector_advancetail(qi, 1); } else goto drop; @@ -398,7 +388,7 @@ static int consume_vector_skbs(struct vector_queue *qi, int count) } /* - * Generic vector deque via sendmmsg with support for forming headers + * Generic vector dequeue via sendmmsg with support for forming headers * using transport specific callback. 
Allows GRE, L2TPv3, RAW and * other transports to use a common dequeue procedure in vector mode */ @@ -408,69 +398,64 @@ static int vector_send(struct vector_queue *qi) { struct vector_private *vp = netdev_priv(qi->dev); struct mmsghdr *send_from; - int result = 0, send_len, queue_depth = qi->max_depth; + int result = 0, send_len; if (spin_trylock(&qi->head_lock)) { - if (spin_trylock(&qi->tail_lock)) { - /* update queue_depth to current value */ - queue_depth = qi->queue_depth; - spin_unlock(&qi->tail_lock); - while (queue_depth > 0) { - /* Calculate the start of the vector */ - send_len = queue_depth; - send_from = qi->mmsg_vector; - send_from += qi->head; - /* Adjust vector size if wraparound */ - if (send_len + qi->head > qi->max_depth) - send_len = qi->max_depth - qi->head; - /* Try to TX as many packets as possible */ - if (send_len > 0) { - result = uml_vector_sendmmsg( - vp->fds->tx_fd, - send_from, - send_len, - 0 - ); - vp->in_write_poll = - (result != send_len); - } - /* For some of the sendmmsg error scenarios - * we may end being unsure in the TX success - * for all packets. It is safer to declare - * them all TX-ed and blame the network. - */ - if (result < 0) { - if (net_ratelimit()) - netdev_err(vp->dev, "sendmmsg err=%i\n", - result); - vp->in_error = true; - result = send_len; - } - if (result > 0) { - queue_depth = - consume_vector_skbs(qi, result); - /* This is equivalent to an TX IRQ. - * Restart the upper layers to feed us - * more packets. - */ - if (result > vp->estats.tx_queue_max) - vp->estats.tx_queue_max = result; - vp->estats.tx_queue_running_average = - (vp->estats.tx_queue_running_average + result) >> 1; - } - netif_wake_queue(qi->dev); - /* if TX is busy, break out of the send loop, - * poll write IRQ will reschedule xmit for us + /* update queue_depth to current value */ + while (atomic_read(&qi->queue_depth) > 0) { + /* Calculate the start of the vector */ + send_len = atomic_read(&qi->queue_depth); + send_from = qi->mmsg_vector; + send_from += qi->head; + /* Adjust vector size if wraparound */ + if (send_len + qi->head > qi->max_depth) + send_len = qi->max_depth - qi->head; + /* Try to TX as many packets as possible */ + if (send_len > 0) { + result = uml_vector_sendmmsg( + vp->fds->tx_fd, + send_from, + send_len, + 0 + ); + vp->in_write_poll = + (result != send_len); + } + /* For some of the sendmmsg error scenarios + * we may end being unsure in the TX success + * for all packets. It is safer to declare + * them all TX-ed and blame the network. + */ + if (result < 0) { + if (net_ratelimit()) + netdev_err(vp->dev, "sendmmsg err=%i\n", + result); + vp->in_error = true; + result = send_len; + } + if (result > 0) { + consume_vector_skbs(qi, result); + /* This is equivalent to an TX IRQ. + * Restart the upper layers to feed us + * more packets. */ - if (result != send_len) { - vp->estats.tx_restart_queue++; - break; - } + if (result > vp->estats.tx_queue_max) + vp->estats.tx_queue_max = result; + vp->estats.tx_queue_running_average = + (vp->estats.tx_queue_running_average + result) >> 1; + } + netif_wake_queue(qi->dev); + /* if TX is busy, break out of the send loop, + * poll write IRQ will reschedule xmit for us. + */ + if (result != send_len) { + vp->estats.tx_restart_queue++; + break; } } spin_unlock(&qi->head_lock); } - return queue_depth; + return atomic_read(&qi->queue_depth); } /* Queue destructor. 
Deliberately stateless so we can use @@ -589,7 +574,7 @@ static struct vector_queue *create_queue( } spin_lock_init(&result->head_lock); spin_lock_init(&result->tail_lock); - result->queue_depth = 0; + atomic_set(&result->queue_depth, 0); result->head = 0; result->tail = 0; return result; @@ -668,18 +653,27 @@ done: } -/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs*/ +/* Prepare queue for recvmmsg one-shot rx - fill with fresh sk_buffs */ static void prep_queue_for_rx(struct vector_queue *qi) { struct vector_private *vp = netdev_priv(qi->dev); struct mmsghdr *mmsg_vector = qi->mmsg_vector; void **skbuff_vector = qi->skbuff_vector; - int i; + int i, queue_depth; + + queue_depth = atomic_read(&qi->queue_depth); - if (qi->queue_depth == 0) + if (queue_depth == 0) return; - for (i = 0; i < qi->queue_depth; i++) { + + /* RX is always emptied 100% during each cycle, so we do not + * have to do the tail wraparound math for it. + */ + + qi->head = qi->tail = 0; + + for (i = 0; i < queue_depth; i++) { /* it is OK if allocation fails - recvmmsg with NULL data in * iov argument still performs an RX, just drops the packet * This allows us stop faffing around with a "drop buffer" @@ -689,7 +683,7 @@ static void prep_queue_for_rx(struct vector_queue *qi) skbuff_vector++; mmsg_vector++; } - qi->queue_depth = 0; + atomic_set(&qi->queue_depth, 0); } static struct vector_device *find_device(int n) @@ -712,11 +706,9 @@ static struct vector_device *find_device(int n) static int vector_parse(char *str, int *index_out, char **str_out, char **error_out) { - int n, len, err; + int n, err; char *start = str; - len = strlen(str); - while ((*str != ':') && (strlen(str) > 1)) str++; if (*str != ':') { @@ -823,7 +815,8 @@ static struct platform_driver uml_net_driver = { static void vector_device_release(struct device *dev) { - struct vector_device *device = dev_get_drvdata(dev); + struct vector_device *device = + container_of(dev, struct vector_device, pdev.dev); struct net_device *netdev = device->dev; list_del(&device->list); @@ -974,7 +967,7 @@ static int vector_mmsg_rx(struct vector_private *vp, int budget) budget = qi->max_depth; packet_count = uml_vector_recvmmsg( - vp->fds->rx_fd, qi->mmsg_vector, qi->max_depth, 0); + vp->fds->rx_fd, qi->mmsg_vector, budget, 0); if (packet_count < 0) vp->in_error = true; @@ -987,7 +980,7 @@ static int vector_mmsg_rx(struct vector_private *vp, int budget) * many do we need to prep the next time prep_queue_for_rx() is called. 
*/ - qi->queue_depth = packet_count; + atomic_add(packet_count, &qi->queue_depth); for (i = 0; i < packet_count; i++) { skb = (*skbuff_vector); @@ -1117,10 +1110,11 @@ static int irq_rr; static int vector_net_close(struct net_device *dev) { struct vector_private *vp = netdev_priv(dev); - unsigned long flags; netif_stop_queue(dev); - del_timer(&vp->tl); + timer_delete(&vp->tl); + + vp->opened = false; if (vp->fds == NULL) return 0; @@ -1160,10 +1154,7 @@ static int vector_net_close(struct net_device *dev) destroy_queue(vp->tx_queue); kfree(vp->fds); vp->fds = NULL; - spin_lock_irqsave(&vp->lock, flags); - vp->opened = false; vp->in_error = false; - spin_unlock_irqrestore(&vp->lock, flags); return 0; } @@ -1176,6 +1167,7 @@ static int vector_poll(struct napi_struct *napi, int budget) if ((vp->options & VECTOR_TX) != 0) tx_enqueued = (vector_send(vp->tx_queue) > 0); + spin_lock(&vp->rx_queue->head_lock); if ((vp->options & VECTOR_RX) > 0) err = vector_mmsg_rx(vp, budget); else { @@ -1183,12 +1175,13 @@ static int vector_poll(struct napi_struct *napi, int budget) if (err > 0) err = 1; } + spin_unlock(&vp->rx_queue->head_lock); if (err > 0) work_done += err; if (tx_enqueued || err > 0) napi_schedule(napi); - if (work_done < budget) + if (work_done <= budget) napi_complete_done(napi, work_done); return work_done; } @@ -1205,17 +1198,12 @@ static void vector_reset_tx(struct work_struct *work) static int vector_net_open(struct net_device *dev) { struct vector_private *vp = netdev_priv(dev); - unsigned long flags; int err = -EINVAL; struct vector_device *vdevice; - spin_lock_irqsave(&vp->lock, flags); - if (vp->opened) { - spin_unlock_irqrestore(&vp->lock, flags); + if (vp->opened) return -ENXIO; - } vp->opened = true; - spin_unlock_irqrestore(&vp->lock, flags); vp->bpf = uml_vector_user_bpf(get_bpf_file(vp->parsed)); @@ -1234,7 +1222,7 @@ static int vector_net_open(struct net_device *dev) vp->rx_header_size, MAX_IOV_SIZE ); - vp->rx_queue->queue_depth = get_depth(vp->parsed); + atomic_set(&vp->rx_queue->queue_depth, get_depth(vp->parsed)); } else { vp->header_rxbuffer = kmalloc( vp->rx_header_size, @@ -1389,8 +1377,6 @@ static int vector_net_load_bpf_flash(struct net_device *dev, return -1; } - spin_lock(&vp->lock); - if (vp->bpf != NULL) { if (vp->opened) uml_vector_detach_bpf(vp->fds->rx_fd, vp->bpf); @@ -1419,15 +1405,12 @@ static int vector_net_load_bpf_flash(struct net_device *dev, if (vp->opened) result = uml_vector_attach_bpf(vp->fds->rx_fd, vp->bpf); - spin_unlock(&vp->lock); - return result; free_buffer: release_firmware(fw); flash_fail: - spin_unlock(&vp->lock); if (vp->bpf != NULL) kfree(vp->bpf->filter); kfree(vp->bpf); @@ -1481,7 +1464,17 @@ static void vector_get_ethtool_stats(struct net_device *dev, { struct vector_private *vp = netdev_priv(dev); + /* Stats are modified in the dequeue portions of + * rx/tx which are protected by the head locks + * grabbing these locks here ensures they are up + * to date. 
+ */ + + spin_lock(&vp->tx_queue->head_lock); + spin_lock(&vp->rx_queue->head_lock); memcpy(tmp_stats, &vp->estats, sizeof(struct vector_estats)); + spin_unlock(&vp->rx_queue->head_lock); + spin_unlock(&vp->tx_queue->head_lock); } static int vector_get_coalesce(struct net_device *netdev, @@ -1633,7 +1626,6 @@ static void vector_eth_configure( INIT_WORK(&vp->reset_tx, vector_reset_tx); timer_setup(&vp->tl, vector_timer_expire, 0); - spin_lock_init(&vp->lock); /* FIXME */ dev->netdev_ops = &vector_netdev_ops; @@ -1702,10 +1694,7 @@ static int __init vector_setup(char *str) str, error); return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->unit = n; new->arguments = str; diff --git a/arch/um/drivers/vector_kern.h b/arch/um/drivers/vector_kern.h index 2a1fa8e0f3e1..417834793658 100644 --- a/arch/um/drivers/vector_kern.h +++ b/arch/um/drivers/vector_kern.h @@ -14,6 +14,7 @@ #include <linux/ctype.h> #include <linux/workqueue.h> #include <linux/interrupt.h> +#include <asm/atomic.h> #include "vector_user.h" @@ -44,7 +45,8 @@ struct vector_queue { struct net_device *dev; spinlock_t head_lock; spinlock_t tail_lock; - int queue_depth, head, tail, max_depth, max_iov_frags; + atomic_t queue_depth; + int head, tail, max_depth, max_iov_frags; short options; }; @@ -71,7 +73,6 @@ struct vector_estats { struct vector_private { struct list_head list; - spinlock_t lock; struct net_device *dev; struct napi_struct napi ____cacheline_aligned; diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index b16a5e5619d3..2ea67e6fd067 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -46,6 +46,9 @@ #define TRANS_FD "fd" #define TRANS_FD_LEN strlen(TRANS_FD) +#define TRANS_VDE "vde" +#define TRANS_VDE_LEN strlen(TRANS_VDE) + #define VNET_HDR_FAIL "could not enable vnet headers on fd %d" #define TUN_GET_F_FAIL "tapraw: TUNGETFEATURES failed: %s" #define L2TPV3_BIND_FAIL "l2tpv3_open : could not bind socket err=%i" @@ -434,6 +437,84 @@ fd_cleanup: return NULL; } +/* enough char to store an int type */ +#define ENOUGH(type) ((CHAR_BIT * sizeof(type) - 1) / 3 + 2) +#define ENOUGH_OCTAL(type) ((CHAR_BIT * sizeof(type) + 2) / 3) +/* vde_plug --descr xx --port2 xx --mod2 xx --group2 xx seqpacket://NN vnl (NULL) */ +#define VDE_MAX_ARGC 12 +#define VDE_SEQPACKET_HEAD "seqpacket://" +#define VDE_SEQPACKET_HEAD_LEN (sizeof(VDE_SEQPACKET_HEAD) - 1) +#define VDE_DEFAULT_DESCRIPTION "UML" + +static struct vector_fds *user_init_vde_fds(struct arglist *ifspec) +{ + char seqpacketvnl[VDE_SEQPACKET_HEAD_LEN + ENOUGH(int) + 1]; + char *argv[VDE_MAX_ARGC] = {"vde_plug"}; + int argc = 1; + int rv; + int sv[2]; + struct vector_fds *result = NULL; + + char *vnl = uml_vector_fetch_arg(ifspec,"vnl"); + char *descr = uml_vector_fetch_arg(ifspec,"descr"); + char *port = uml_vector_fetch_arg(ifspec,"port"); + char *mode = uml_vector_fetch_arg(ifspec,"mode"); + char *group = uml_vector_fetch_arg(ifspec,"group"); + if (descr == NULL) descr = VDE_DEFAULT_DESCRIPTION; + + argv[argc++] = "--descr"; + argv[argc++] = descr; + if (port != NULL) { + argv[argc++] = "--port2"; + argv[argc++] = port; + } + if (mode != NULL) { + argv[argc++] = "--mod2"; + argv[argc++] = mode; + } + if (group != NULL) { + argv[argc++] = "--group2"; + argv[argc++] = group; + } + argv[argc++] = seqpacketvnl; + argv[argc++] = vnl; + 
argv[argc++] = NULL; + + rv = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv); + if (rv < 0) { + printk(UM_KERN_ERR "vde: seqpacket socketpair err %d", -errno); + return NULL; + } + rv = os_set_exec_close(sv[0]); + if (rv < 0) { + printk(UM_KERN_ERR "vde: seqpacket socketpair cloexec err %d", -errno); + goto vde_cleanup_sv; + } + snprintf(seqpacketvnl, sizeof(seqpacketvnl), VDE_SEQPACKET_HEAD "%d", sv[1]); + + run_helper(NULL, NULL, argv); + + close(sv[1]); + + result = uml_kmalloc(sizeof(struct vector_fds), UM_GFP_KERNEL); + if (result == NULL) { + printk(UM_KERN_ERR "fd open: allocation failed"); + goto vde_cleanup; + } + + result->rx_fd = sv[0]; + result->tx_fd = sv[0]; + result->remote_addr_size = 0; + result->remote_addr = NULL; + return result; + +vde_cleanup_sv: + close(sv[1]); +vde_cleanup: + close(sv[0]); + return NULL; +} + static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) { int rxfd = -1, txfd = -1; @@ -673,6 +754,8 @@ struct vector_fds *uml_vector_user_open( return user_init_unix_fds(parsed, ID_BESS); if (strncmp(transport, TRANS_FD, TRANS_FD_LEN) == 0) return user_init_fd_fds(parsed); + if (strncmp(transport, TRANS_VDE, TRANS_VDE_LEN) == 0) + return user_init_vde_fds(parsed); return NULL; } diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h index 6f147cd3c9f7..fcfa3b7e021b 100644 --- a/arch/um/drivers/vhost_user.h +++ b/arch/um/drivers/vhost_user.h @@ -10,6 +10,7 @@ /* Feature bits */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 /* Protocol feature bits */ +#define VHOST_USER_PROTOCOL_F_MQ 0 #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 #define VHOST_USER_PROTOCOL_F_CONFIG 9 @@ -23,7 +24,8 @@ /* Supported transport features */ #define VHOST_USER_SUPPORTED_F BIT_ULL(VHOST_USER_F_PROTOCOL_FEATURES) /* Supported protocol features */ -#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ +#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_MQ) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index 97a37c062997..b83b5a765d4e 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -5,45 +5,19 @@ */ #include <linux/module.h> #include <linux/pci.h> -#include <linux/virtio.h> -#include <linux/virtio_config.h> #include <linux/logic_iomem.h> #include <linux/of_platform.h> #include <linux/irqdomain.h> -#include <linux/virtio_pcidev.h> -#include <linux/virtio-uml.h> -#include <linux/delay.h> #include <linux/msi.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> #include <irq_kern.h> +#include "virt-pci.h" + #define MAX_DEVICES 8 #define MAX_MSI_VECTORS 32 #define CFG_SPACE_SIZE 4096 -/* for MSI-X we have a 32-bit payload */ -#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) -#define NUM_IRQ_MSGS 10 - -#define HANDLE_NO_FREE(ptr) ((void *)((unsigned long)(ptr) | 1)) -#define HANDLE_IS_NO_FREE(ptr) ((unsigned long)(ptr) & 1) - -struct um_pci_device { - struct virtio_device *vdev; - - /* for now just standard BARs */ - u8 resptr[PCI_STD_NUM_BARS]; - - struct virtqueue *cmd_vq, *irq_vq; - -#define UM_PCI_STAT_WAITING 0 - unsigned long status; - - int irq; - - bool platform; -}; - struct um_pci_device_reg { struct um_pci_device *dev; void __iomem *iomem; @@ -58,150 +32,15 @@ static struct irq_domain 
*um_pci_inner_domain; static struct irq_domain *um_pci_msi_domain; static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; -static unsigned int um_pci_max_delay_us = 40000; -module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644); - -struct um_pci_message_buffer { - struct virtio_pcidev_msg hdr; - u8 data[8]; -}; - -static struct um_pci_message_buffer __percpu *um_pci_msg_bufs; - -static int um_pci_send_cmd(struct um_pci_device *dev, - struct virtio_pcidev_msg *cmd, - unsigned int cmd_size, - const void *extra, unsigned int extra_size, - void *out, unsigned int out_size) -{ - struct scatterlist out_sg, extra_sg, in_sg; - struct scatterlist *sgs_list[] = { - [0] = &out_sg, - [1] = extra ? &extra_sg : &in_sg, - [2] = extra ? &in_sg : NULL, - }; - struct um_pci_message_buffer *buf; - int delay_count = 0; - int ret, len; - bool posted; - - if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) - return -EINVAL; - - switch (cmd->op) { - case VIRTIO_PCIDEV_OP_CFG_WRITE: - case VIRTIO_PCIDEV_OP_MMIO_WRITE: - case VIRTIO_PCIDEV_OP_MMIO_MEMSET: - /* in PCI, writes are posted, so don't wait */ - posted = !out; - WARN_ON(!posted); - break; - default: - posted = false; - break; - } - - buf = get_cpu_var(um_pci_msg_bufs); - if (buf) - memcpy(buf, cmd, cmd_size); - - if (posted) { - u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); - - if (ncmd) { - memcpy(ncmd, cmd, cmd_size); - if (extra) - memcpy(ncmd + cmd_size, extra, extra_size); - cmd = (void *)ncmd; - cmd_size += extra_size; - extra = NULL; - extra_size = 0; - } else { - /* try without allocating memory */ - posted = false; - cmd = (void *)buf; - } - } else { - cmd = (void *)buf; - } - - sg_init_one(&out_sg, cmd, cmd_size); - if (extra) - sg_init_one(&extra_sg, extra, extra_size); - if (out) - sg_init_one(&in_sg, out, out_size); - - /* add to internal virtio queue */ - ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, - extra ? 2 : 1, - out ? 1 : 0, - posted ? 
cmd : HANDLE_NO_FREE(cmd), - GFP_ATOMIC); - if (ret) { - if (posted) - kfree(cmd); - goto out; - } - - if (posted) { - virtqueue_kick(dev->cmd_vq); - ret = 0; - goto out; - } - - /* kick and poll for getting a response on the queue */ - set_bit(UM_PCI_STAT_WAITING, &dev->status); - virtqueue_kick(dev->cmd_vq); - - while (1) { - void *completed = virtqueue_get_buf(dev->cmd_vq, &len); - - if (completed == HANDLE_NO_FREE(cmd)) - break; - - if (completed && !HANDLE_IS_NO_FREE(completed)) - kfree(completed); - - if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || - ++delay_count > um_pci_max_delay_us, - "um virt-pci delay: %d", delay_count)) { - ret = -EIO; - break; - } - udelay(1); - } - clear_bit(UM_PCI_STAT_WAITING, &dev->status); - -out: - put_cpu_var(um_pci_msg_bufs); - return ret; -} - static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, int size) { struct um_pci_device_reg *reg = priv; struct um_pci_device *dev = reg->dev; - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_READ, - .size = size, - .addr = offset, - }; - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - size_t bytes = sizeof(buf->data); if (!dev) return ULONG_MAX; - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; - - if (buf) - memset(data, 0xff, bytes); - switch (size) { case 1: case 2: @@ -212,34 +51,10 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, break; default: WARN(1, "invalid config space read size %d\n", size); - goto out; - } - - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, bytes)) - goto out; - - switch (size) { - case 1: - ret = data[0]; - break; - case 2: - ret = le16_to_cpup((void *)data); - break; - case 4: - ret = le32_to_cpup((void *)data); - break; -#ifdef CONFIG_64BIT - case 8: - ret = le64_to_cpup((void *)data); - break; -#endif - default: - break; + return ULONG_MAX; } -out: - put_cpu_var(um_pci_msg_bufs); - return ret; + return dev->ops->cfgspace_read(dev, offset, size); } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -247,42 +62,24 @@ static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, { struct um_pci_device_reg *reg = priv; struct um_pci_device *dev = reg->dev; - struct { - struct virtio_pcidev_msg hdr; - /* maximum size - we may only use parts of it */ - u8 data[8]; - } msg = { - .hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_WRITE, - .size = size, - .addr = offset, - }, - }; if (!dev) return; switch (size) { case 1: - msg.data[0] = (u8)val; - break; case 2: - put_unaligned_le16(val, (void *)msg.data); - break; case 4: - put_unaligned_le32(val, (void *)msg.data); - break; #ifdef CONFIG_64BIT case 8: - put_unaligned_le64(val, (void *)msg.data); - break; #endif + break; default: WARN(1, "invalid config space write size %d\n", size); return; } - WARN_ON(um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); + dev->ops->cfgspace_write(dev, offset, size, val); } static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { @@ -290,35 +87,14 @@ static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { .write = um_pci_cfgspace_write, }; -static void um_pci_bar_copy_from(void *priv, void *buffer, - unsigned int offset, int size) +static unsigned long um_pci_bar_read(void *priv, unsigned int offset, + int size) { u8 *resptr = priv; struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct virtio_pcidev_msg hdr = { 
- .op = VIRTIO_PCIDEV_OP_MMIO_READ, - .bar = *resptr, - .size = size, - .addr = offset, - }; - - memset(buffer, 0xff, size); - - um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); -} - -static unsigned long um_pci_bar_read(void *priv, unsigned int offset, - int size) -{ - /* buf->data is maximum size - we may only use parts of it */ - struct um_pci_message_buffer *buf; - u8 *data; - unsigned long ret = ULONG_MAX; - - buf = get_cpu_var(um_pci_msg_bufs); - data = buf->data; + u8 bar = *resptr; switch (size) { case 1: @@ -329,80 +105,60 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset, #endif break; default: - WARN(1, "invalid config space read size %d\n", size); - goto out; + WARN(1, "invalid bar read size %d\n", size); + return ULONG_MAX; } - um_pci_bar_copy_from(priv, data, offset, size); + return dev->ops->bar_read(dev, bar, offset, size); +} + +static void um_pci_bar_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; switch (size) { case 1: - ret = data[0]; - break; case 2: - ret = le16_to_cpup((void *)data); - break; case 4: - ret = le32_to_cpup((void *)data); - break; #ifdef CONFIG_64BIT case 8: - ret = le64_to_cpup((void *)data); - break; #endif - default: break; + default: + WARN(1, "invalid bar write size %d\n", size); + return; } -out: - put_cpu_var(um_pci_msg_bufs); - return ret; + dev->ops->bar_write(dev, bar, offset, size, val); } -static void um_pci_bar_copy_to(void *priv, unsigned int offset, - const void *buffer, int size) +static void um_pci_bar_copy_from(void *priv, void *buffer, + unsigned int offset, int size) { u8 *resptr = priv; struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, - .bar = *resptr, - .size = size, - .addr = offset, - }; + u8 bar = *resptr; - um_pci_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); + dev->ops->bar_copy_from(dev, bar, buffer, offset, size); } -static void um_pci_bar_write(void *priv, unsigned int offset, int size, - unsigned long val) +static void um_pci_bar_copy_to(void *priv, unsigned int offset, + const void *buffer, int size) { - /* maximum size - we may only use parts of it */ - u8 data[8]; - - switch (size) { - case 1: - data[0] = (u8)val; - break; - case 2: - put_unaligned_le16(val, (void *)data); - break; - case 4: - put_unaligned_le32(val, (void *)data); - break; -#ifdef CONFIG_64BIT - case 8: - put_unaligned_le64(val, (void *)data); - break; -#endif - default: - WARN(1, "invalid config space write size %d\n", size); - return; - } + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; - um_pci_bar_copy_to(priv, offset, data, size); + dev->ops->bar_copy_to(dev, bar, offset, buffer, size); } static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) @@ -411,20 +167,9 @@ static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct { - struct virtio_pcidev_msg hdr; - u8 data; - } msg = { - .hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_WRITE, - .bar = *resptr, - .size = size, - .addr = offset, - }, - .data = value, - }; + u8 bar = *resptr; - um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); + 
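/*
 * After this refactor the logic_iomem accessors in this file no longer
 * speak virtio themselves: they only validate the access size and
 * forward to the transport through dev->ops (declared in virt-pci.h
 * below). A minimal hypothetical backend - all names here invented for
 * illustration - would plug in roughly like:
 *
 *   static unsigned long my_cfg_read(struct um_pci_device *dev,
 *                                    unsigned int offset, int size)
 *   {
 *           return 0;    // e.g. read from an emulated config space
 *   }
 *
 *   static const struct um_pci_ops my_pci_ops = {
 *           .cfgspace_read = my_cfg_read,
 *           // ... remaining handlers ...
 *   };
 *
 *   dev->ops = &my_pci_ops;
 *   err = um_pci_device_register(dev);
 */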
dev->ops->bar_set(dev, bar, offset, value, size); } static const struct logic_iomem_ops um_pci_device_bar_ops = { @@ -471,79 +216,6 @@ static void um_pci_rescan(void) pci_unlock_rescan_remove(); } -static void um_pci_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) -{ - struct scatterlist sg[1]; - - sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); - if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) - kfree(buf); - else if (kick) - virtqueue_kick(vq); -} - -static void um_pci_handle_irq_message(struct virtqueue *vq, - struct virtio_pcidev_msg *msg) -{ - struct virtio_device *vdev = vq->vdev; - struct um_pci_device *dev = vdev->priv; - - if (!dev->irq) - return; - - /* we should properly chain interrupts, but on ARCH=um we don't care */ - - switch (msg->op) { - case VIRTIO_PCIDEV_OP_INT: - generic_handle_irq(dev->irq); - break; - case VIRTIO_PCIDEV_OP_MSI: - /* our MSI message is just the interrupt number */ - if (msg->size == sizeof(u32)) - generic_handle_irq(le32_to_cpup((void *)msg->data)); - else - generic_handle_irq(le16_to_cpup((void *)msg->data)); - break; - case VIRTIO_PCIDEV_OP_PME: - /* nothing to do - we already woke up due to the message */ - break; - default: - dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); - break; - } -} - -static void um_pci_cmd_vq_cb(struct virtqueue *vq) -{ - struct virtio_device *vdev = vq->vdev; - struct um_pci_device *dev = vdev->priv; - void *cmd; - int len; - - if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) - return; - - while ((cmd = virtqueue_get_buf(vq, &len))) { - if (WARN_ON(HANDLE_IS_NO_FREE(cmd))) - continue; - kfree(cmd); - } -} - -static void um_pci_irq_vq_cb(struct virtqueue *vq) -{ - struct virtio_pcidev_msg *msg; - int len; - - while ((msg = virtqueue_get_buf(vq, &len))) { - if (len >= sizeof(*msg)) - um_pci_handle_irq_message(vq, msg); - - /* recycle the message buffer */ - um_pci_irq_vq_addbuf(vq, msg, true); - } -} - #ifdef CONFIG_OF /* Copied from arch/x86/kernel/devicetree.c */ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) @@ -565,199 +237,6 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) } #endif -static int um_pci_init_vqs(struct um_pci_device *dev) -{ - struct virtqueue *vqs[2]; - static const char *const names[2] = { "cmd", "irq" }; - vq_callback_t *cbs[2] = { um_pci_cmd_vq_cb, um_pci_irq_vq_cb }; - int err, i; - - err = virtio_find_vqs(dev->vdev, 2, vqs, cbs, names, NULL); - if (err) - return err; - - dev->cmd_vq = vqs[0]; - dev->irq_vq = vqs[1]; - - virtio_device_ready(dev->vdev); - - for (i = 0; i < NUM_IRQ_MSGS; i++) { - void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); - - if (msg) - um_pci_irq_vq_addbuf(dev->irq_vq, msg, false); - } - - virtqueue_kick(dev->irq_vq); - - return 0; -} - -static void __um_pci_virtio_platform_remove(struct virtio_device *vdev, - struct um_pci_device *dev) -{ - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); - - mutex_lock(&um_pci_mtx); - um_pci_platform_device = NULL; - mutex_unlock(&um_pci_mtx); - - kfree(dev); -} - -static int um_pci_virtio_platform_probe(struct virtio_device *vdev, - struct um_pci_device *dev) -{ - int ret; - - dev->platform = true; - - mutex_lock(&um_pci_mtx); - - if (um_pci_platform_device) { - mutex_unlock(&um_pci_mtx); - ret = -EBUSY; - goto out_free; - } - - ret = um_pci_init_vqs(dev); - if (ret) { - mutex_unlock(&um_pci_mtx); - goto out_free; - } - - um_pci_platform_device = dev; - - mutex_unlock(&um_pci_mtx); - - ret = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); - if (ret) - 
__um_pci_virtio_platform_remove(vdev, dev); - - return ret; - -out_free: - kfree(dev); - return ret; -} - -static int um_pci_virtio_probe(struct virtio_device *vdev) -{ - struct um_pci_device *dev; - int i, free = -1; - int err = -ENOSPC; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - dev->vdev = vdev; - vdev->priv = dev; - - if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) - return um_pci_virtio_platform_probe(vdev, dev); - - mutex_lock(&um_pci_mtx); - for (i = 0; i < MAX_DEVICES; i++) { - if (um_pci_devices[i].dev) - continue; - free = i; - break; - } - - if (free < 0) - goto error; - - err = um_pci_init_vqs(dev); - if (err) - goto error; - - dev->irq = irq_alloc_desc(numa_node_id()); - if (dev->irq < 0) { - err = dev->irq; - goto err_reset; - } - um_pci_devices[free].dev = dev; - vdev->priv = dev; - - mutex_unlock(&um_pci_mtx); - - device_set_wakeup_enable(&vdev->dev, true); - - /* - * In order to do suspend-resume properly, don't allow VQs - * to be suspended. - */ - virtio_uml_set_no_vq_suspend(vdev, true); - - um_pci_rescan(); - return 0; -err_reset: - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); -error: - mutex_unlock(&um_pci_mtx); - kfree(dev); - return err; -} - -static void um_pci_virtio_remove(struct virtio_device *vdev) -{ - struct um_pci_device *dev = vdev->priv; - int i; - - if (dev->platform) { - of_platform_depopulate(&vdev->dev); - __um_pci_virtio_platform_remove(vdev, dev); - return; - } - - device_set_wakeup_enable(&vdev->dev, false); - - mutex_lock(&um_pci_mtx); - for (i = 0; i < MAX_DEVICES; i++) { - if (um_pci_devices[i].dev != dev) - continue; - - um_pci_devices[i].dev = NULL; - irq_free_desc(dev->irq); - - break; - } - mutex_unlock(&um_pci_mtx); - - if (i < MAX_DEVICES) { - struct pci_dev *pci_dev; - - pci_dev = pci_get_slot(bridge->bus, i); - if (pci_dev) - pci_stop_and_remove_bus_device_locked(pci_dev); - } - - /* Stop all virtqueues */ - virtio_reset_device(vdev); - dev->cmd_vq = NULL; - dev->irq_vq = NULL; - vdev->config->del_vqs(vdev); - - kfree(dev); -} - -static struct virtio_device_id id_table[] = { - { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, - { 0 }, -}; -MODULE_DEVICE_TABLE(virtio, id_table); - -static struct virtio_driver um_pci_virtio_driver = { - .driver.name = "virtio-pci", - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = um_pci_virtio_probe, - .remove = um_pci_virtio_remove, -}; - static struct resource virt_cfgspace_resource = { .name = "PCI config space", .start = 0xf0000000 - MAX_DEVICES * CFG_SPACE_SIZE, @@ -876,7 +355,7 @@ static void um_pci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) } static struct irq_chip um_pci_msi_bottom_irq_chip = { - .name = "UM virtio MSI", + .name = "UM virtual MSI", .irq_compose_msi_msg = um_pci_compose_msi_msg, }; @@ -926,7 +405,7 @@ static const struct irq_domain_ops um_pci_inner_domain_ops = { }; static struct irq_chip um_pci_msi_irq_chip = { - .name = "UM virtio PCIe MSI", + .name = "UM virtual PCIe MSI", .irq_mask = pci_msi_mask_irq, .irq_unmask = pci_msi_unmask_irq, }; @@ -985,8 +464,85 @@ static struct resource virt_platform_resource = { .flags = IORESOURCE_MEM, }; +int um_pci_device_register(struct um_pci_device *dev) +{ + int i, free = -1; + int err = 0; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev) + continue; + free = i; + break; + } + + if (free < 0) { + err = -ENOSPC; + goto out; + } + + dev->irq = irq_alloc_desc(numa_node_id()); + if (dev->irq < 0) 
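/*
 * Registration flow, as implemented around this point:
 * um_pci_device_register() claims a free slot in um_pci_devices[] and
 * an IRQ descriptor under um_pci_mtx, then triggers a bus rescan so
 * the new device gets probed; um_pci_device_unregister() reverses this
 * and removes the pci_dev via pci_stop_and_remove_bus_device_locked().
 * The platform variants further down use guard(mutex)(&um_pci_mtx),
 * the cleanup.h scope-based guard that drops the mutex automatically
 * on every return path, so no explicit unlock labels are needed there.
 */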
{ + err = dev->irq; + goto out; + } + + um_pci_devices[free].dev = dev; + +out: + mutex_unlock(&um_pci_mtx); + if (!err) + um_pci_rescan(); + return err; +} + +void um_pci_device_unregister(struct um_pci_device *dev) +{ + int i; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev != dev) + continue; + um_pci_devices[i].dev = NULL; + irq_free_desc(dev->irq); + break; + } + mutex_unlock(&um_pci_mtx); + + if (i < MAX_DEVICES) { + struct pci_dev *pci_dev; + + pci_dev = pci_get_slot(bridge->bus, i); + if (pci_dev) + pci_stop_and_remove_bus_device_locked(pci_dev); + } +} + +int um_pci_platform_device_register(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device) + return -EBUSY; + um_pci_platform_device = dev; + return 0; +} + +void um_pci_platform_device_unregister(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device == dev) + um_pci_platform_device = NULL; +} + static int __init um_pci_init(void) { + struct irq_domain_info inner_domain_info = { + .size = MAX_MSI_VECTORS, + .hwirq_max = MAX_MSI_VECTORS, + .ops = &um_pci_inner_domain_ops, + }; int err, i; WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource, @@ -996,14 +552,6 @@ static int __init um_pci_init(void) WARN_ON(logic_iomem_add_region(&virt_platform_resource, &um_pci_platform_ops)); - if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, - "No virtio device ID configured for PCI - no PCI support\n")) - return 0; - - um_pci_msg_bufs = alloc_percpu(struct um_pci_message_buffer); - if (!um_pci_msg_bufs) - return -ENOMEM; - bridge = pci_alloc_host_bridge(0); if (!bridge) { err = -ENOMEM; @@ -1016,11 +564,10 @@ static int __init um_pci_init(void) goto free; } - um_pci_inner_domain = __irq_domain_add(um_pci_fwnode, MAX_MSI_VECTORS, - MAX_MSI_VECTORS, 0, - &um_pci_inner_domain_ops, NULL); - if (!um_pci_inner_domain) { - err = -ENOMEM; + inner_domain_info.fwnode = um_pci_fwnode; + um_pci_inner_domain = irq_domain_instantiate(&inner_domain_info); + if (IS_ERR(um_pci_inner_domain)) { + err = PTR_ERR(um_pci_inner_domain); goto free; } @@ -1052,12 +599,10 @@ static int __init um_pci_init(void) if (err) goto free; - err = register_virtio_driver(&um_pci_virtio_driver); - if (err) - goto free; return 0; + free: - if (um_pci_inner_domain) + if (!IS_ERR_OR_NULL(um_pci_inner_domain)) irq_domain_remove(um_pci_inner_domain); if (um_pci_fwnode) irq_domain_free_fwnode(um_pci_fwnode); @@ -1065,18 +610,15 @@ free: pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); } - free_percpu(um_pci_msg_bufs); return err; } -module_init(um_pci_init); +device_initcall(um_pci_init); static void __exit um_pci_exit(void) { - unregister_virtio_driver(&um_pci_virtio_driver); irq_domain_remove(um_pci_msi_domain); irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); pci_free_host_bridge(bridge); - free_percpu(um_pci_msg_bufs); } module_exit(um_pci_exit); diff --git a/arch/um/drivers/virt-pci.h b/arch/um/drivers/virt-pci.h new file mode 100644 index 000000000000..b20d1475d1eb --- /dev/null +++ b/arch/um/drivers/virt-pci.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VIRT_PCI_H +#define __UM_VIRT_PCI_H + +#include <linux/pci.h> + +struct um_pci_device { + const struct um_pci_ops *ops; + + /* for now just standard BARs */ + u8 resptr[PCI_STD_NUM_BARS]; + + int irq; +}; + +struct um_pci_ops { + unsigned long (*cfgspace_read)(struct um_pci_device *dev, + unsigned int offset, int size); + void 
(*cfgspace_write)(struct um_pci_device *dev, unsigned int offset, + int size, unsigned long val); + + unsigned long (*bar_read)(struct um_pci_device *dev, int bar, + unsigned int offset, int size); + void (*bar_write)(struct um_pci_device *dev, int bar, + unsigned int offset, int size, unsigned long val); + + void (*bar_copy_from)(struct um_pci_device *dev, int bar, void *buffer, + unsigned int offset, int size); + void (*bar_copy_to)(struct um_pci_device *dev, int bar, + unsigned int offset, const void *buffer, int size); + void (*bar_set)(struct um_pci_device *dev, int bar, + unsigned int offset, u8 value, int size); +}; + +int um_pci_device_register(struct um_pci_device *dev); +void um_pci_device_unregister(struct um_pci_device *dev); + +int um_pci_platform_device_register(struct um_pci_device *dev); +void um_pci_platform_device_unregister(struct um_pci_device *dev); + +#endif /* __UM_VIRT_PCI_H */ diff --git a/arch/um/drivers/virtio_pcidev.c b/arch/um/drivers/virtio_pcidev.c new file mode 100644 index 000000000000..3c4c4c928fdd --- /dev/null +++ b/arch/um/drivers/virtio_pcidev.c @@ -0,0 +1,628 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg <johannes@sipsolutions.net> + */ +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/logic_iomem.h> +#include <linux/of_platform.h> +#include <linux/irqdomain.h> +#include <linux/virtio_pcidev.h> +#include <linux/virtio-uml.h> +#include <linux/delay.h> +#include <linux/msi.h> +#include <linux/unaligned.h> +#include <irq_kern.h> + +#include "virt-pci.h" + +#define to_virtio_pcidev(_pdev) \ + container_of(_pdev, struct virtio_pcidev_device, pdev) + +/* for MSI-X we have a 32-bit payload */ +#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) +#define NUM_IRQ_MSGS 10 + +struct virtio_pcidev_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; + +struct virtio_pcidev_device { + struct um_pci_device pdev; + struct virtio_device *vdev; + + struct virtqueue *cmd_vq, *irq_vq; + +#define VIRTIO_PCIDEV_WRITE_BUFS 20 + struct virtio_pcidev_message_buffer bufs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + void *extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + DECLARE_BITMAP(used_bufs, VIRTIO_PCIDEV_WRITE_BUFS); + +#define UM_PCI_STAT_WAITING 0 + unsigned long status; + + bool platform; +}; + +static unsigned int virtio_pcidev_max_delay_us = 40000; +module_param_named(max_delay_us, virtio_pcidev_max_delay_us, uint, 0644); + +static int virtio_pcidev_get_buf(struct virtio_pcidev_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } + + *posted = false; + return VIRTIO_PCIDEV_WRITE_BUFS; +} + +static void virtio_pcidev_free_buf(struct virtio_pcidev_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[VIRTIO_PCIDEV_WRITE_BUFS]) { + kfree(dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS]); + dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} + +static int virtio_pcidev_send_cmd(struct virtio_pcidev_device *dev, + struct virtio_pcidev_msg *cmd, + unsigned int cmd_size, + const void *extra, unsigned int extra_size, + void *out, unsigned int out_size) +{ + struct scatterlist 
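/*
 * Why the buffer pool above: posted (write) commands complete
 * asynchronously, so the message must outlive this function. Instead
 * of the old percpu buffer plus a kmalloc() per posted write, messages
 * now come from the fixed per-device pool: virtio_pcidev_get_buf()
 * claims a slot through the used_bufs bitmap, and slots are returned
 * either on the synchronous path below or from the cmd-vq callback
 * once the device has consumed them. If all VIRTIO_PCIDEV_WRITE_BUFS
 * slots are busy, the spare slot at index VIRTIO_PCIDEV_WRITE_BUFS is
 * used and *posted is cleared, forcing the caller to wait so that the
 * spare slot is free again by the time this function returns.
 */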
out_sg, extra_sg, in_sg; + struct scatterlist *sgs_list[] = { + [0] = &out_sg, + [1] = extra ? &extra_sg : &in_sg, + [2] = extra ? &in_sg : NULL, + }; + struct virtio_pcidev_message_buffer *buf; + int delay_count = 0; + bool bounce_out; + int ret, len; + int buf_idx; + bool posted; + + if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) + return -EINVAL; + + switch (cmd->op) { + case VIRTIO_PCIDEV_OP_CFG_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_MEMSET: + /* in PCI, writes are posted, so don't wait */ + posted = !out; + WARN_ON(!posted); + break; + default: + posted = false; + break; + } + + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); + + buf_idx = virtio_pcidev_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + virtio_pcidev_free_buf(dev, buf); + return -ENOMEM; + } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; + } else { + cmd = (void *)buf; + } + + sg_init_one(&out_sg, cmd, cmd_size); + if (extra) + sg_init_one(&extra_sg, extra, extra_size); + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) + sg_init_one(&in_sg, out, out_size); + + /* add to internal virtio queue */ + ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, + extra ? 2 : 1, + out ? 1 : 0, + cmd, GFP_ATOMIC); + if (ret) { + virtio_pcidev_free_buf(dev, buf); + return ret; + } + + if (posted) { + virtqueue_kick(dev->cmd_vq); + return 0; + } + + /* kick and poll for getting a response on the queue */ + set_bit(UM_PCI_STAT_WAITING, &dev->status); + virtqueue_kick(dev->cmd_vq); + ret = 0; + + while (1) { + void *completed = virtqueue_get_buf(dev->cmd_vq, &len); + + if (completed == buf) + break; + + if (completed) + virtio_pcidev_free_buf(dev, completed); + + if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || + ++delay_count > virtio_pcidev_max_delay_us, + "um virt-pci delay: %d", delay_count)) { + ret = -EIO; + break; + } + udelay(1); + } + clear_bit(UM_PCI_STAT_WAITING, &dev->status); + + if (bounce_out) + memcpy(out, buf->data, out_size); + + virtio_pcidev_free_buf(dev, buf); + + return ret; +} + +static unsigned long virtio_pcidev_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_READ, + .size = size, + .addr = offset, + }; + /* max 8, we might not use it all */ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + /* size has been checked in um_pci_cfgspace_read() */ + if (virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + 
/* maximum size - we may only use parts of it */ + u8 data[8]; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .size = size, + .addr = offset, + }, + }; + + /* size has been checked in um_pci_cfgspace_write() */ + switch (size) { + case 1: + msg.data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)msg.data); + break; + case 4: + put_unaligned_le32(val, (void *)msg.data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)msg.data); + break; +#endif + } + + WARN_ON(virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); +} + +static void virtio_pcidev_bar_copy_from(struct um_pci_device *pdev, + int bar, void *buffer, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_READ, + .bar = bar, + .size = size, + .addr = offset, + }; + + memset(buffer, 0xff, size); + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); +} + +static unsigned long virtio_pcidev_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_read() */ + virtio_pcidev_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_bar_copy_to(struct um_pci_device *pdev, + int bar, unsigned int offset, + const void *buffer, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }; + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); +} + +static void virtio_pcidev_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + /* maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_write() */ + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + virtio_pcidev_bar_copy_to(pdev, bar, offset, data, size); +} + +static void virtio_pcidev_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + u8 data; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }, + .data = value, + }; + + virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); +} + +static const struct um_pci_ops virtio_pcidev_um_pci_ops = { + .cfgspace_read = virtio_pcidev_cfgspace_read, + .cfgspace_write = virtio_pcidev_cfgspace_write, + .bar_read = virtio_pcidev_bar_read, + .bar_write = virtio_pcidev_bar_write, + .bar_copy_from = virtio_pcidev_bar_copy_from, + .bar_copy_to = virtio_pcidev_bar_copy_to, + .bar_set = virtio_pcidev_bar_set, +}; + +static void virtio_pcidev_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); + 
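/*
 * Interrupt-queue flow: NUM_IRQ_MSGS buffers of MAX_IRQ_MSG_SIZE are
 * pre-posted on the irq vq at init time; virtio_pcidev_irq_vq_cb()
 * below hands each completed message to the INT/MSI/PME handler and
 * then recycles the same buffer through this helper, so the queue
 * never runs dry. For MSI the payload is simply the little-endian
 * interrupt number, hence the extra 32 bits of room reserved in
 * MAX_IRQ_MSG_SIZE.
 */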
if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) + kfree(buf); + else if (kick) + virtqueue_kick(vq); +} + +static void virtio_pcidev_handle_irq_message(struct virtqueue *vq, + struct virtio_pcidev_msg *msg) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + + if (!dev->pdev.irq) + return; + + /* we should properly chain interrupts, but on ARCH=um we don't care */ + + switch (msg->op) { + case VIRTIO_PCIDEV_OP_INT: + generic_handle_irq(dev->pdev.irq); + break; + case VIRTIO_PCIDEV_OP_MSI: + /* our MSI message is just the interrupt number */ + if (msg->size == sizeof(u32)) + generic_handle_irq(le32_to_cpup((void *)msg->data)); + else + generic_handle_irq(le16_to_cpup((void *)msg->data)); + break; + case VIRTIO_PCIDEV_OP_PME: + /* nothing to do - we already woke up due to the message */ + break; + default: + dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); + break; + } +} + +static void virtio_pcidev_cmd_vq_cb(struct virtqueue *vq) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + void *cmd; + int len; + + if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) + return; + + while ((cmd = virtqueue_get_buf(vq, &len))) + virtio_pcidev_free_buf(dev, cmd); +} + +static void virtio_pcidev_irq_vq_cb(struct virtqueue *vq) +{ + struct virtio_pcidev_msg *msg; + int len; + + while ((msg = virtqueue_get_buf(vq, &len))) { + if (len >= sizeof(*msg)) + virtio_pcidev_handle_irq_message(vq, msg); + + /* recycle the message buffer */ + virtio_pcidev_irq_vq_addbuf(vq, msg, true); + } +} + +static int virtio_pcidev_init_vqs(struct virtio_pcidev_device *dev) +{ + struct virtqueue_info vqs_info[] = { + { "cmd", virtio_pcidev_cmd_vq_cb }, + { "irq", virtio_pcidev_irq_vq_cb }, + }; + struct virtqueue *vqs[2]; + int err, i; + + err = virtio_find_vqs(dev->vdev, 2, vqs, vqs_info, NULL); + if (err) + return err; + + dev->cmd_vq = vqs[0]; + dev->irq_vq = vqs[1]; + + virtio_device_ready(dev->vdev); + + for (i = 0; i < NUM_IRQ_MSGS; i++) { + void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); + + if (msg) + virtio_pcidev_irq_vq_addbuf(dev->irq_vq, msg, false); + } + + virtqueue_kick(dev->irq_vq); + + return 0; +} + +static void __virtio_pcidev_virtio_platform_remove(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + um_pci_platform_device_unregister(&dev->pdev); + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static int virtio_pcidev_virtio_platform_probe(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + int err; + + dev->platform = true; + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_platform_device_register(&dev->pdev); + if (err) + goto err_reset; + + err = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); + if (err) + goto err_unregister; + + return 0; + +err_unregister: + um_pci_platform_device_unregister(&dev->pdev); +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static int virtio_pcidev_virtio_probe(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->vdev = vdev; + vdev->priv = dev; + + dev->pdev.ops = &virtio_pcidev_um_pci_ops; + + if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) + return virtio_pcidev_virtio_platform_probe(vdev, dev); + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto 
err_free; + + err = um_pci_device_register(&dev->pdev); + if (err) + goto err_reset; + + device_set_wakeup_enable(&vdev->dev, true); + + /* + * In order to do suspend-resume properly, don't allow VQs + * to be suspended. + */ + virtio_uml_set_no_vq_suspend(vdev, true); + + return 0; + +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static void virtio_pcidev_virtio_remove(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev = vdev->priv; + + if (dev->platform) { + of_platform_depopulate(&vdev->dev); + __virtio_pcidev_virtio_platform_remove(vdev, dev); + return; + } + + device_set_wakeup_enable(&vdev->dev, false); + + um_pci_device_unregister(&dev->pdev); + + /* Stop all virtqueues */ + virtio_reset_device(vdev); + dev->cmd_vq = NULL; + dev->irq_vq = NULL; + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static struct virtio_device_id id_table[] = { + { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +static struct virtio_driver virtio_pcidev_virtio_driver = { + .driver.name = "virtio-pci", + .id_table = id_table, + .probe = virtio_pcidev_virtio_probe, + .remove = virtio_pcidev_virtio_remove, +}; + +static int __init virtio_pcidev_init(void) +{ + if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, + "No virtio device ID configured for PCI - no PCI support\n")) + return 0; + + return register_virtio_driver(&virtio_pcidev_virtio_driver); +} +late_initcall(virtio_pcidev_init); + +static void __exit virtio_pcidev_exit(void) +{ + unregister_virtio_driver(&virtio_pcidev_virtio_driver); +} +module_exit(virtio_pcidev_exit); diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 8adca2000e51..ad8d78fb1d9a 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -52,10 +52,11 @@ struct virtio_uml_device { struct platform_device *pdev; struct virtio_uml_platform_data *pdata; - spinlock_t sock_lock; + raw_spinlock_t sock_lock; int sock, req_fd, irq; u64 features; u64 protocol_features; + u64 max_vqs; u8 status; u8 registered:1; u8 suspended:1; @@ -72,8 +73,6 @@ struct virtio_uml_vq_info { bool suspended; }; -extern unsigned long long physmem_size, highmem; - #define vu_err(vu_dev, ...) 
dev_err(&(vu_dev)->pdev->dev, ##__VA_ARGS__) /* Vhost-user protocol */ @@ -247,7 +246,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, if (request_ack) msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; - spin_lock_irqsave(&vu_dev->sock_lock, flags); + raw_spin_lock_irqsave(&vu_dev->sock_lock, flags); rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); if (rc < 0) goto out; @@ -267,7 +266,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, } out: - spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + raw_spin_unlock_irqrestore(&vu_dev->sock_lock, flags); return rc; } @@ -343,6 +342,17 @@ static int vhost_user_set_protocol_features(struct virtio_uml_device *vu_dev, protocol_features); } +static int vhost_user_get_queue_num(struct virtio_uml_device *vu_dev, + u64 *queue_num) +{ + int rc = vhost_user_send_no_payload(vu_dev, true, + VHOST_USER_GET_QUEUE_NUM); + + if (rc) + return rc; + return vhost_user_recv_u64(vu_dev, queue_num); +} + static void vhost_user_reply(struct virtio_uml_device *vu_dev, struct vhost_user_msg *msg, int response) { @@ -516,6 +526,15 @@ static int vhost_user_init(struct virtio_uml_device *vu_dev) return rc; } + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_MQ)) { + rc = vhost_user_get_queue_num(vu_dev, &vu_dev->max_vqs); + if (rc) + return rc; + } else { + vu_dev->max_vqs = U64_MAX; + } + return 0; } @@ -625,7 +644,7 @@ static int vhost_user_set_mem_table(struct virtio_uml_device *vu_dev) { struct vhost_user_msg msg = { .header.request = VHOST_USER_SET_MEM_TABLE, - .header.size = sizeof(msg.payload.mem_regions), + .header.size = offsetof(typeof(msg.payload.mem_regions), regions[1]), .payload.mem_regions.num = 1, }; unsigned long reserved = uml_reserved - uml_physmem; @@ -673,13 +692,6 @@ static int vhost_user_set_mem_table(struct virtio_uml_device *vu_dev) if (rc < 0) return rc; - if (highmem) { - msg.payload.mem_regions.num++; - rc = vhost_user_init_mem_region(__pa(end_iomem), highmem, - &fds[1], &msg.payload.mem_regions.regions[1]); - if (rc < 0) - return rc; - } return vhost_user_send(vu_dev, false, &msg, fds, msg.payload.mem_regions.num); @@ -897,7 +909,7 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, { struct virtio_uml_vq_info *info = vq->priv; int call_fds[2]; - int rc; + int rc, irq; /* no call FD needed/desired in this case */ if (vu_dev->protocol_features & @@ -914,19 +926,23 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, return rc; info->call_fd = call_fds[0]; - rc = um_request_irq(vu_dev->irq, info->call_fd, IRQ_READ, - vu_interrupt, IRQF_SHARED, info->name, vq); - if (rc < 0) + irq = um_request_irq(vu_dev->irq, info->call_fd, IRQ_READ, + vu_interrupt, IRQF_SHARED, info->name, vq); + if (irq < 0) { + rc = irq; goto close_both; + } rc = vhost_user_set_vring_call(vu_dev, vq->index, call_fds[1]); if (rc) goto release_irq; + vu_dev->irq = irq; + goto out; release_irq: - um_free_irq(vu_dev->irq, vq); + um_free_irq(irq, vq); close_both: os_close_file(call_fds[0]); out: @@ -1014,8 +1030,8 @@ error_kzalloc: } static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, + struct virtqueue *vqs[], + struct virtqueue_info vqs_info[], struct irq_affinity *desc) { struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); @@ -1023,7 +1039,9 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vq; /* not supported for now */ - if 
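/*
 * Multiqueue negotiation as added in this file, sketched as a message
 * flow rather than literal code:
 *
 *   GET_FEATURES           -> must offer VHOST_USER_F_PROTOCOL_FEATURES
 *   GET_PROTOCOL_FEATURES  -> if VHOST_USER_PROTOCOL_F_MQ is set:
 *   GET_QUEUE_NUM          ->     max_vqs = reply
 *                             else max_vqs = U64_MAX (no advertised limit)
 *
 * vu_find_vqs() below then rejects requests for more than 64 VQs or
 * more than the device's advertised max_vqs.
 */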
(WARN_ON(nvqs > 64)) + if (WARN(nvqs > 64 || nvqs > vu_dev->max_vqs, + "%d VQs requested, only up to 64 or %lld supported\n", + nvqs, vu_dev->max_vqs)) return -EINVAL; rc = vhost_user_set_mem_table(vu_dev); @@ -1031,13 +1049,15 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, return rc; for (i = 0; i < nvqs; ++i) { - if (!names[i]) { + struct virtqueue_info *vqi = &vqs_info[i]; + + if (!vqi->name) { vqs[i] = NULL; continue; } - vqs[i] = vu_setup_vq(vdev, queue_idx++, callbacks[i], names[i], - ctx ? ctx[i] : false); + vqs[i] = vu_setup_vq(vdev, queue_idx++, vqi->callback, + vqi->name, vqi->ctx); if (IS_ERR(vqs[i])) { rc = PTR_ERR(vqs[i]); goto error_setup; @@ -1208,6 +1228,7 @@ static int virtio_uml_probe(struct platform_device *pdev) vu_dev->vdev.id.vendor = VIRTIO_DEV_ANY_ID; vu_dev->pdev = pdev; vu_dev->req_fd = -1; + vu_dev->irq = UM_IRQ_ALLOC; time_travel_propagate_time(); @@ -1218,7 +1239,7 @@ static int virtio_uml_probe(struct platform_device *pdev) goto error_free; vu_dev->sock = rc; - spin_lock_init(&vu_dev->sock_lock); + raw_spin_lock_init(&vu_dev->sock_lock); rc = vhost_user_init(vu_dev); if (rc) @@ -1241,12 +1262,11 @@ error_free: return rc; } -static int virtio_uml_remove(struct platform_device *pdev) +static void virtio_uml_remove(struct platform_device *pdev) { struct virtio_uml_device *vu_dev = platform_get_drvdata(pdev); unregister_virtio_device(&vu_dev->vdev); - return 0; } /* Command line device list */ diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index 6918de5e2956..e4316c7981e8 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -156,7 +156,7 @@ static int xterm_open(int input, int output, int primary, void *d, new = xterm_fd(fd, &data->helper_pid); if (new < 0) { err = new; - printk(UM_KERN_ERR "xterm_open : os_rcv_fd failed, err = %d\n", + printk(UM_KERN_ERR "xterm_open : xterm_fd failed, err = %d\n", -err); goto out_kill; } diff --git a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c index 8011e51993d5..3971252cb1a6 100644 --- a/arch/um/drivers/xterm_kern.c +++ b/arch/um/drivers/xterm_kern.c @@ -21,12 +21,19 @@ struct xterm_wait { static irqreturn_t xterm_interrupt(int irq, void *data) { struct xterm_wait *xterm = data; - int fd; + int fd = -1, n_fds = 1; + ssize_t ret; - fd = os_rcv_fd(xterm->fd, &xterm->pid); - if (fd == -EAGAIN) + ret = os_rcv_fd_msg(xterm->fd, &fd, n_fds, + &xterm->pid, sizeof(xterm->pid)); + if (ret == -EAGAIN) return IRQ_NONE; + if (ret < 0) + fd = ret; + else if (ret != sizeof(xterm->pid)) + fd = -EMSGSIZE; + xterm->new_fd = fd; complete(&xterm->ready); diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index b2d834a29f3a..04ab3b653a48 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -1,14 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 -generic-y += bpf_perf_event.h generic-y += bug.h generic-y += compat.h -generic-y += current.h generic-y += device.h generic-y += dma-mapping.h generic-y += emergency-restart.h generic-y += exec.h generic-y += extable.h -generic-y += fb.h generic-y += ftrace.h generic-y += hw_irq.h generic-y += irq_regs.h @@ -16,11 +13,13 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mmiowb.h +generic-y += module.h generic-y += module.lds.h generic-y += param.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h +generic-y += runtime-const.h generic-y += softirq_stack.h generic-y += switch_to.h generic-y += topology.h @@ -28,3 +27,4 @@ generic-y += 
trace_clock.h generic-y += kprobes.h generic-y += mm_hooks.h generic-y += vga.h +generic-y += video.h diff --git a/arch/um/include/asm/bpf_perf_event.h b/arch/um/include/asm/bpf_perf_event.h new file mode 100644 index 000000000000..287221342d2c --- /dev/null +++ b/arch/um/include/asm/bpf_perf_event.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * asm-generic/bpf_perf_event.h is part of the uapi headers, but since + * arch/um has no uapi of its on, we can't use the "generic-y" + * Kbuild rule to generate the wrapper + */ + +#include <asm-generic/bpf_perf_event.h> diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h index 66fe06db872f..1eb8b834fbec 100644 --- a/arch/um/include/asm/cpufeature.h +++ b/arch/um/include/asm/cpufeature.h @@ -38,8 +38,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ - x86_this_cpu_test_bit(bit, \ - (unsigned long __percpu *)&cpu_info.x86_capability)) + x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This macro is for detection of features which need kernel diff --git a/arch/um/include/asm/current.h b/arch/um/include/asm/current.h new file mode 100644 index 000000000000..de64e032d66c --- /dev/null +++ b/arch/um/include/asm/current.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include <linux/compiler.h> +#include <linux/threads.h> + +#ifndef __ASSEMBLY__ + +struct task_struct; +extern struct task_struct *cpu_tasks[NR_CPUS]; + +static __always_inline struct task_struct *get_current(void) +{ + return cpu_tasks[0]; +} + + +#define current get_current() + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h deleted file mode 100644 index 2efac5827188..000000000000 --- a/arch/um/include/asm/fixmap.h +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_FIXMAP_H -#define __UM_FIXMAP_H - -#include <asm/processor.h> -#include <asm/archparam.h> -#include <asm/page.h> -#include <linux/threads.h> - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. - * Also this lets us do fail-safe vmalloc(), we - * can guarantee that these special addresses and - * vmalloc()-ed addresses never overlap. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. - */ - -/* - * on UP currently we will have no trace of the fixmap mechanizm, - * no page table allocations, etc. This might change in the - * future, say framebuffers for the console driver(s) could be - * fix-mapped? - */ -enum fixed_addresses { - __end_of_fixed_addresses -}; - -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); - -/* - * used by vmalloc.c. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap, and leave one page empty - * at the top of mem.. 
- */ - -#define FIXADDR_TOP (TASK_SIZE - 2 * PAGE_SIZE) -#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - -#include <asm-generic/fixmap.h> - -#endif diff --git a/arch/um/include/asm/kasan.h b/arch/um/include/asm/kasan.h index 0d6547f4ec85..f97bb1f7b851 100644 --- a/arch/um/include/asm/kasan.h +++ b/arch/um/include/asm/kasan.h @@ -24,7 +24,6 @@ #ifdef CONFIG_KASAN void kasan_init(void); -void kasan_map_memory(void *start, unsigned long len); extern int kasan_um_is_ready; #ifdef CONFIG_STATIC_LINK diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index a7555e43ed14..a3eaca41ff61 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -7,17 +7,13 @@ #define __ARCH_UM_MMU_H #include <mm_id.h> -#include <asm/mm_context.h> typedef struct mm_context { struct mm_id id; - struct uml_arch_mm_context arch; -} mm_context_t; - -extern void __switch_mm(struct mm_id * mm_idp); -/* Avoid tangled inclusion with asm/ldt.h */ -extern long init_new_ldt(struct mm_context *to_mm, struct mm_context *from_mm); -extern void free_ldt(struct mm_context *mm); + /* Address range in need of a TLB sync */ + unsigned long sync_tlb_range_from; + unsigned long sync_tlb_range_to; +} mm_context_t; #endif diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index 68e2eb9cfb47..23dcc914d44e 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -13,8 +13,6 @@ #include <asm/mm_hooks.h> #include <asm/mmu.h> -extern void force_flush_all(void); - #define activate_mm activate_mm static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) { diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 9ef9a8aedfa6..3d516f3ca9c7 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h @@ -9,10 +9,7 @@ #include <linux/const.h> -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT CONFIG_PAGE_SHIFT -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#include <vdso/page.h> #ifndef __ASSEMBLY__ @@ -32,51 +29,35 @@ struct page; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT) - typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pmd; } pmd_t; typedef struct { unsigned long pgd; } pgd_t; -#define pte_val(p) ((p).pte) -#define pte_get_bits(p, bits) ((p).pte & (bits)) -#define pte_set_bits(p, bits) ((p).pte |= (bits)) -#define pte_clear_bits(p, bits) ((p).pte &= ~(bits)) -#define pte_copy(to, from) ({ (to).pte = (from).pte; }) -#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) -#define pte_set_val(p, phys, prot) \ - ({ (p).pte = (phys) | pgprot_val(prot); }) +#if CONFIG_PGTABLE_LEVELS > 2 +typedef struct { unsigned long pmd; } pmd_t; #define pmd_val(x) ((x).pmd) #define __pmd(x) ((pmd_t) { (x) } ) -typedef unsigned long long phys_t; +#if CONFIG_PGTABLE_LEVELS > 3 -#else - -typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pgd; } pgd_t; +typedef struct { unsigned long pud; } pud_t; +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) } ) -#ifdef CONFIG_3_LEVEL_PGTABLES -typedef struct { unsigned long pmd; } pmd_t; -#define pmd_val(x) ((x).pmd) -#define __pmd(x) ((pmd_t) { (x) } ) -#endif +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ #define pte_val(x) ((x).pte) - 
#define pte_get_bits(p, bits) ((p).pte & (bits)) #define pte_set_bits(p, bits) ((p).pte |= (bits)) #define pte_clear_bits(p, bits) ((p).pte &= ~(bits)) #define pte_copy(to, from) ((to).pte = (from).pte) -#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) +#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEEDSYNC)) #define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) typedef unsigned long phys_t; -#endif - typedef struct { unsigned long pgprot; } pgprot_t; typedef struct page *pgtable_t; diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index de5e31c64793..826ec44b58cd 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -25,20 +25,20 @@ */ extern pgd_t *pgd_alloc(struct mm_struct *); -#define __pte_free_tlb(tlb, pte, address) \ -do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ -} while (0) +#define __pte_free_tlb(tlb, pte, address) \ + tlb_remove_ptdesc((tlb), page_ptdesc(pte)) -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 -#define __pmd_free_tlb(tlb, pmd, address) \ -do { \ - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ - tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ -} while (0) +#define __pmd_free_tlb(tlb, pmd, address) \ + tlb_remove_ptdesc((tlb), virt_to_ptdesc(pmd)) +#if CONFIG_PGTABLE_LEVELS > 3 + +#define __pud_free_tlb(tlb, pud, address) \ + tlb_remove_ptdesc((tlb), virt_to_ptdesc(pud)) + +#endif #endif #endif diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index 8256ecc5b919..ab0c8dd86564 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -31,7 +31,7 @@ printk("%s:%d: bad pgd %p(%08lx).\n", __FILE__, __LINE__, &(e), \ pgd_val(e)) -static inline int pgd_newpage(pgd_t pgd) { return 0; } +static inline int pgd_needsync(pgd_t pgd) { return 0; } static inline void pgd_mkuptodate(pgd_t pgd) { } #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-4level.h index 8a5032ec231f..0d279caee93c 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-4level.h @@ -4,21 +4,25 @@ * Derived from include/asm-i386/pgtable.h */ -#ifndef __UM_PGTABLE_3LEVEL_H -#define __UM_PGTABLE_3LEVEL_H +#ifndef __UM_PGTABLE_4LEVEL_H +#define __UM_PGTABLE_4LEVEL_H -#include <asm-generic/pgtable-nopud.h> +#include <asm-generic/pgtable-nop4d.h> -/* PGDIR_SHIFT determines what a third-level page table entry can map */ +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#ifdef CONFIG_64BIT -#define PGDIR_SHIFT 30 -#else -#define PGDIR_SHIFT 31 -#endif +#define PGDIR_SHIFT 39 #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) +/* PUD_SHIFT determines the size of the area a third-level page table can + * map + */ + +#define PUD_SHIFT 30 +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + /* PMD_SHIFT determines the size of the area a second-level page table can * map */ @@ -32,13 +36,9 @@ */ #define PTRS_PER_PTE 512 -#ifdef CONFIG_64BIT #define PTRS_PER_PMD 512 +#define PTRS_PER_PUD 512 #define PTRS_PER_PGD 512 -#else -#define PTRS_PER_PMD 1024 -#define PTRS_PER_PGD 1024 -#endif #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) @@ -48,11 +48,14 @@ #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ pmd_val(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad 
pud %p(%016lx).\n", __FILE__, __LINE__, &(e), \ + pud_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), \ pgd_val(e)) -#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEWPAGE)) +#define pud_none(x) (!(pud_val(x) & ~_PAGE_NEEDSYNC)) #define pud_bad(x) ((pud_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pud_present(x) (pud_val(x) & _PAGE_PRESENT) #define pud_populate(mm, pud, pmd) \ @@ -60,23 +63,40 @@ #define set_pud(pudptr, pudval) (*(pudptr) = (pudval)) -static inline int pgd_newpage(pgd_t pgd) +#define p4d_none(x) (!(p4d_val(x) & ~_PAGE_NEEDSYNC)) +#define p4d_bad(x) ((p4d_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define p4d_present(x) (p4d_val(x) & _PAGE_PRESENT) +#define p4d_populate(mm, p4d, pud) \ + set_p4d(p4d, __p4d(_PAGE_TABLE + __pa(pud))) + +#define set_p4d(p4dptr, p4dval) (*(p4dptr) = (p4dval)) + + +static inline int pgd_needsync(pgd_t pgd) { - return(pgd_val(pgd) & _PAGE_NEWPAGE); + return pgd_val(pgd) & _PAGE_NEEDSYNC; } -static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; } +static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEEDSYNC; } #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) static inline void pud_clear (pud_t *pud) { - set_pud(pud, __pud(_PAGE_NEWPAGE)); + set_pud(pud, __pud(_PAGE_NEEDSYNC)); +} + +static inline void p4d_clear (p4d_t *p4d) +{ + set_p4d(p4d, __p4d(_PAGE_NEEDSYNC)); } #define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) #define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & PAGE_MASK)) +#define p4d_page(p4d) phys_to_page(p4d_val(p4d) & PAGE_MASK) +#define p4d_pgtable(p4d) ((pud_t *) __va(p4d_val(p4d) & PAGE_MASK)) + static inline unsigned long pte_pfn(pte_t pte) { return phys_to_pfn(pte_val(pte)); @@ -97,4 +117,3 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) } #endif - diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index e1ece21dbe3f..5601ca98e8a6 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -8,11 +8,11 @@ #ifndef __UM_PGTABLE_H #define __UM_PGTABLE_H -#include <asm/fixmap.h> +#include <asm/page.h> +#include <linux/mm_types.h> #define _PAGE_PRESENT 0x001 -#define _PAGE_NEWPAGE 0x002 -#define _PAGE_NEWPROT 0x004 +#define _PAGE_NEEDSYNC 0x002 #define _PAGE_RW 0x020 #define _PAGE_USER 0x040 #define _PAGE_ACCESSED 0x080 @@ -24,10 +24,12 @@ /* We borrow bit 10 to store the exclusive marker in swap PTEs. 
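 * Address split with the new 4-level layout above (matching the
 * x86-64 non-LA57 split, assuming the usual 4K pages): bits 47..39
 * index the pgd, 38..30 the pud, 29..21 the pmd and 20..12 the pte,
 * i.e. 512 entries per level for a 48-bit, 256 TiB virtual span; for
 * example PUD_SIZE = 1UL << 30 covers 1 GiB per pud entry.
 *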
*/ #define _PAGE_SWP_EXCLUSIVE 0x400 -#ifdef CONFIG_3_LEVEL_PGTABLES -#include <asm/pgtable-3level.h> -#else +#if CONFIG_PGTABLE_LEVELS == 4 +#include <asm/pgtable-4level.h> +#elif CONFIG_PGTABLE_LEVELS == 2 #include <asm/pgtable-2level.h> +#else +#error "Unsupported number of page table levels" #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; @@ -47,11 +49,9 @@ extern unsigned long end_iomem; #define VMALLOC_OFFSET (__va_space) #define VMALLOC_START ((end_iomem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) -#define PKMAP_BASE ((FIXADDR_START - LAST_PKMAP * PAGE_SIZE) & PMD_MASK) -#define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +#define VMALLOC_END (TASK_SIZE-2*PAGE_SIZE) #define MODULES_VADDR VMALLOC_START #define MODULES_END VMALLOC_END -#define MODULES_LEN (MODULES_VADDR - MODULES_END) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) @@ -78,22 +78,22 @@ extern unsigned long end_iomem; */ #define ZERO_PAGE(vaddr) virt_to_page(empty_zero_page) -#define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) +#define pte_clear(mm, addr, xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEEDSYNC)) -#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) +#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEEDSYNC)) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) +#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEEDSYNC; } while (0) -#define pmd_newpage(x) (pmd_val(x) & _PAGE_NEWPAGE) -#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE) +#define pmd_needsync(x) (pmd_val(x) & _PAGE_NEEDSYNC) +#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEEDSYNC) -#define pud_newpage(x) (pud_val(x) & _PAGE_NEWPAGE) -#define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEWPAGE) +#define pud_needsync(x) (pud_val(x) & _PAGE_NEEDSYNC) +#define pud_mkuptodate(x) (pud_val(x) &= ~_PAGE_NEEDSYNC) -#define p4d_newpage(x) (p4d_val(x) & _PAGE_NEWPAGE) -#define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEWPAGE) +#define p4d_needsync(x) (p4d_val(x) & _PAGE_NEEDSYNC) +#define p4d_mkuptodate(x) (p4d_val(x) &= ~_PAGE_NEEDSYNC) #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) phys_to_page(pmd_val(pmd) & PAGE_MASK) @@ -144,14 +144,9 @@ static inline int pte_young(pte_t pte) return pte_get_bits(pte, _PAGE_ACCESSED); } -static inline int pte_newpage(pte_t pte) -{ - return pte_get_bits(pte, _PAGE_NEWPAGE); -} - -static inline int pte_newprot(pte_t pte) +static inline int pte_needsync(pte_t pte) { - return(pte_present(pte) && (pte_get_bits(pte, _PAGE_NEWPROT))); + return pte_get_bits(pte, _PAGE_NEEDSYNC); } /* @@ -160,12 +155,6 @@ static inline int pte_newprot(pte_t pte) * ================================= */ -static inline pte_t pte_mknewprot(pte_t pte) -{ - pte_set_bits(pte, _PAGE_NEWPROT); - return(pte); -} - static inline pte_t pte_mkclean(pte_t pte) { pte_clear_bits(pte, _PAGE_DIRTY); @@ -180,19 +169,14 @@ static inline pte_t pte_mkold(pte_t pte) static inline pte_t pte_wrprotect(pte_t pte) { - if (likely(pte_get_bits(pte, _PAGE_RW))) - pte_clear_bits(pte, _PAGE_RW); - else - return pte; - return(pte_mknewprot(pte)); + pte_clear_bits(pte, _PAGE_RW); + return pte; } static inline pte_t pte_mkread(pte_t pte) { - if (unlikely(pte_get_bits(pte, _PAGE_USER))) - return pte; pte_set_bits(pte, 
_PAGE_USER); - return(pte_mknewprot(pte)); + return pte; } static inline pte_t pte_mkdirty(pte_t pte) @@ -209,23 +193,19 @@ static inline pte_t pte_mkyoung(pte_t pte) static inline pte_t pte_mkwrite_novma(pte_t pte) { - if (unlikely(pte_get_bits(pte, _PAGE_RW))) - return pte; pte_set_bits(pte, _PAGE_RW); - return(pte_mknewprot(pte)); + return pte; } static inline pte_t pte_mkuptodate(pte_t pte) { - pte_clear_bits(pte, _PAGE_NEWPAGE); - if(pte_present(pte)) - pte_clear_bits(pte, _PAGE_NEWPROT); - return(pte); + pte_clear_bits(pte, _PAGE_NEEDSYNC); + return pte; } -static inline pte_t pte_mknewpage(pte_t pte) +static inline pte_t pte_mkneedsync(pte_t pte) { - pte_set_bits(pte, _PAGE_NEWPAGE); + pte_set_bits(pte, _PAGE_NEEDSYNC); return(pte); } @@ -233,21 +213,51 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) { pte_copy(*pteptr, pteval); - /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so - * fix_range knows to unmap it. _PAGE_NEWPROT is specific to - * mapped pages. + /* If it's a swap entry, it needs to be marked _PAGE_NEEDSYNC so + * update_pte_range knows to unmap it. */ - *pteptr = pte_mknewpage(*pteptr); - if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); + *pteptr = pte_mkneedsync(*pteptr); } #define PFN_PTE_SHIFT PAGE_SHIFT +static inline void um_tlb_mark_sync(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + if (!mm->context.sync_tlb_range_to) { + mm->context.sync_tlb_range_from = start; + mm->context.sync_tlb_range_to = end; + } else { + if (start < mm->context.sync_tlb_range_from) + mm->context.sync_tlb_range_from = start; + if (end > mm->context.sync_tlb_range_to) + mm->context.sync_tlb_range_to = end; + } +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int nr) +{ + /* Basically the default implementation */ + size_t length = nr * PAGE_SIZE; + + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + } + + um_tlb_mark_sync(mm, addr, addr + length); +} + #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { - return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEWPAGE); + return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEEDSYNC); } /* @@ -255,17 +265,13 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) * and a page entry and page directory to the page they refer to. */ -#define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) #define __virt_to_page(virt) phys_to_page(__pa(virt)) -#define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) #define virt_to_page(addr) __virt_to_page((const unsigned long) addr) #define mk_pte(page, pgprot) \ ({ pte_t pte; \ \ pte_set_val(pte, page_to_phys(page), (pgprot)); \ - if (pte_present(pte)) \ - pte_mknewprot(pte_mknewpage(pte)); \ pte;}) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -299,7 +305,7 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); * <--------------- offset ----------------> E < type -> 0 0 0 1 0 * * E is the exclusive marker that is not stored in swap entries. - * _PAGE_NEWPAGE (bit 1) is always set to 1 in set_pte(). + * _PAGE_NEEDSYNC (bit 1) is always set to 1 in set_pte().
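The effect of um_tlb_mark_sync() above is easiest to see in isolation: the mm context keeps a single pending [from, to) window, every call only ever widens it, and the actual host-side mmap/munmap work happens later in um_tlb_sync(). A standalone model of just the widening logic (struct toy_ctx and the test values are illustrative, not kernel code):

	#include <assert.h>

	struct toy_ctx { unsigned long from, to; };	/* to == 0: nothing pending */

	static void mark_sync(struct toy_ctx *c, unsigned long start, unsigned long end)
	{
		if (!c->to) {			/* first range since the last sync */
			c->from = start;
			c->to = end;
		} else {			/* otherwise just widen the window */
			if (start < c->from)
				c->from = start;
			if (end > c->to)
				c->to = end;
		}
	}

	int main(void)
	{
		struct toy_ctx c = { 0, 0 };

		mark_sync(&c, 0x2000, 0x3000);
		mark_sync(&c, 0x1000, 0x1800);	/* grows the window downwards */
		assert(c.from == 0x1000 && c.to == 0x3000);
		return 0;
	}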
*/ #define __swp_type(x) (((x).val >> 5) & 0x1f) #define __swp_offset(x) ((x).val >> 11) @@ -327,11 +333,4 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } -/* Clear a kernel PTE and flush it from the TLB */ -#define kpte_clear_flush(ptep, vaddr) \ -do { \ - pte_clear(&init_mm, (vaddr), (ptep)); \ - __flush_tlb_one((vaddr)); \ -} while (0) - #endif diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 6c3779541845..8a789c17acd8 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -20,38 +20,29 @@ struct task_struct; struct mm_struct; struct thread_struct { - struct pt_regs regs; struct pt_regs *segv_regs; - void *fault_addr; - jmp_buf *fault_catcher; struct task_struct *prev_sched; struct arch_thread arch; jmp_buf switch_buf; struct { - int op; - union { - struct { - int pid; - } fork, exec; - struct { - int (*proc)(void *); - void *arg; - } thread; - struct { - void (*proc)(void *); - void *arg; - } cb; - } u; + struct { + int (*proc)(void *); + void *arg; + } thread; } request; + + void *segv_continue; + + /* Contains variable sized FP registers */ + struct pt_regs regs; }; #define INIT_THREAD \ { \ .regs = EMPTY_REGS, \ - .fault_addr = NULL, \ .prev_sched = NULL, \ .arch = INIT_ARCH_THREAD, \ - .request = { 0 } \ + .request = { } \ } /* @@ -94,7 +85,6 @@ extern struct cpuinfo_um boot_cpu_data; #define current_cpu_data boot_cpu_data #define cache_line_size() (boot_cpu_data.cache_alignment) -extern unsigned long get_thread_reg(int reg, jmp_buf *buf); #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf) extern unsigned long __get_wchan(struct task_struct *p); diff --git a/arch/um/include/asm/ptrace-generic.h b/arch/um/include/asm/ptrace-generic.h index adf91ef553ae..4696f24d1492 100644 --- a/arch/um/include/asm/ptrace-generic.h +++ b/arch/um/include/asm/ptrace-generic.h @@ -36,6 +36,9 @@ extern long subarch_ptrace(struct task_struct *child, long request, extern unsigned long getreg(struct task_struct *child, int regno); extern int putreg(struct task_struct *child, int regno, unsigned long value); +extern int poke_user(struct task_struct *child, long addr, long data); +extern int peek_user(struct task_struct *child, long addr, long data); + extern int arch_set_tls(struct task_struct *new, unsigned long tls); extern void clear_flushed_tls(struct task_struct *task); extern int syscall_trace_enter(struct pt_regs *regs); diff --git a/arch/um/include/asm/sysrq.h b/arch/um/include/asm/sysrq.h deleted file mode 100644 index 8fc8c65cd357..000000000000 --- a/arch/um/include/asm/sysrq.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SYSRQ_H -#define __UM_SYSRQ_H - -struct task_struct; -extern void show_trace(struct task_struct* task, unsigned long *stack); - -#endif diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index c7b4b49826a2..f9ad06fcc991 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -17,35 +17,17 @@ #include <sysdep/ptrace_user.h> struct thread_info { - struct task_struct *task; /* main task structure */ unsigned long flags; /* low level flags */ __u32 cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct thread_info *real_thread; /* Points to non-IRQ stack */ - unsigned long aux_fp_regs[FP_SIZE]; /* auxiliary fp_regs to save/restore - them out-of-band */ }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ 
.flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .real_thread = NULL, \ -} - -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - struct thread_info *ti; - unsigned long mask = THREAD_SIZE - 1; - void *p; - - asm volatile ("" : "=r" (p) : "0" (&ti)); - ti = (struct thread_info *) (((unsigned long)p) & ~mask); - return ti; } #endif diff --git a/arch/um/include/asm/tlbflush.h b/arch/um/include/asm/tlbflush.h index a5bda890390d..13a3009942be 100644 --- a/arch/um/include/asm/tlbflush.h +++ b/arch/um/include/asm/tlbflush.h @@ -9,23 +9,51 @@ #include <linux/mm.h> /* - * TLB flushing: + * In UML, we need to sync the TLB over by using mmap/munmap syscalls from + * the process handling the MM (which can be the kernel itself). + * + * To track updates, we can hook into set_ptes and flush_tlb_*. With set_ptes + * we catch all PTE transitions where memory that was unusable becomes usable. + * While with flush_tlb_* we can track any memory that becomes unusable, + * even if a higher layer of the page table was modified. + * + * So, we simply track updates using both methods and mark the memory area to + * be synced later on. The only special case is that flush_tlb_kern_* needs to + * be executed immediately as there is no good synchronization point in that + * case. In contrast, in the set_ptes case we can wait for the next kernel + * segfault before we do the synchronization. * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_kernel_vm() flushes the kernel vm area * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages */ +extern int um_tlb_sync(struct mm_struct *mm); + extern void flush_tlb_all(void); extern void flush_tlb_mm(struct mm_struct *mm); -extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end); -extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long address); -extern void flush_tlb_kernel_vm(void); -extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -extern void __flush_tlb_one(unsigned long addr); + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long address) +{ + um_tlb_mark_sync(vma->vm_mm, address, address + PAGE_SIZE); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + um_tlb_mark_sync(vma->vm_mm, start, end); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + um_tlb_mark_sync(&init_mm, start, end); + + /* Kernel needs to be synced immediately */ + um_tlb_sync(&init_mm); +} #endif diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 7d9d60e41e4e..3a08f9029a3f 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -8,7 +8,8 @@ #define __UM_UACCESS_H #include <asm/elf.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> +#include <sysdep/faultinfo.h> #define __under_task_size(addr, size) \ (((unsigned long) (addr) < TASK_SIZE) && \ @@ -44,19 +45,28 @@ static inline int __access_ok(const void __user *ptr, unsigned long size) __access_ok_vsyscall(addr, size)); } -/* no pagefaults for kernel addresses in um */ #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ -
*((type *)dst) = get_unaligned((type *)(src)); \ - if (0) /* make sure the label looks used to the compiler */ \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) { \ + *((type *)dst) = (type) 0; \ goto err_label; \ + } \ + *((type *)dst) = get_unaligned((type *)(src)); \ + current->thread.segv_continue = NULL; \ } while (0) #define __put_kernel_nofault(dst, src, type, err_label) \ do { \ - put_unaligned(*((type *)src), (type *)(dst)); \ - if (0) /* make sure the label looks used to the compiler */ \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) \ goto err_label; \ + put_unaligned(*((type *)src), (type *)(dst)); \ + current->thread.segv_continue = NULL; \ } while (0) #endif diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index b22226634ff6..138908b999d7 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -83,6 +83,8 @@ extern void time_travel_not_configured(void); #define time_travel_del_event(...) time_travel_not_configured() #endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +extern unsigned long tt_extra_sched_jiffies; + /* * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used, * which is intentional since we really shouldn't link it in that case. diff --git a/arch/um/include/shared/arch.h b/arch/um/include/shared/arch.h index 880ee42a3329..cc398a21ad96 100644 --- a/arch/um/include/shared/arch.h +++ b/arch/um/include/shared/arch.h @@ -12,4 +12,6 @@ extern void arch_check_bugs(void); extern int arch_fixup(unsigned long address, struct uml_pt_regs *regs); extern void arch_examine_signal(int sig, struct uml_pt_regs *regs); +void mc_set_rip(void *_mc, void *target); + #endif diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index 9ec3015bc5e2..4f44dcce8a7c 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -23,37 +23,34 @@ #define STUB_START stub_start #define STUB_CODE STUB_START #define STUB_DATA (STUB_CODE + UM_KERN_PAGE_SIZE) -#define STUB_DATA_PAGES 1 /* must be a power of two */ +#define STUB_DATA_PAGES 2 /* must be a power of two */ #define STUB_END (STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE) #ifndef __ASSEMBLY__ #include <sysdep/ptrace.h> -struct cpu_task { - int pid; - void *task; -}; +struct task_struct; +extern struct task_struct *cpu_tasks[]; -extern struct cpu_task cpu_tasks[]; +extern unsigned long long physmem_size; extern unsigned long high_physmem; extern unsigned long uml_physmem; extern unsigned long uml_reserved; extern unsigned long end_vm; extern unsigned long start_vm; -extern unsigned long long highmem; extern unsigned long brk_start; extern unsigned long host_task_size; extern unsigned long stub_start; -extern int linux_main(int argc, char **argv); +extern int linux_main(int argc, char **argv, char **envp); extern void uml_finishsetup(void); struct siginfo; -extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *); +extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *, void *); #endif diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 96195483fbd0..73f3a4792ed8 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -1,13 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* for use by sys-$SUBARCH/kernel-offsets.c */ -#include <stub-data.h> DEFINE(KERNEL_MADV_REMOVE, MADV_REMOVE); 
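The reworked __get_kernel_nofault() in uaccess.h above arms a fault catcher (thread.segv_continue, set up by the ___backtrack_faulted() helper from sysdep/faultinfo.h) instead of assuming that kernel addresses can never fault. A hedged sketch of how a caller consumes the macro's goto-on-fault contract (read_kernel_long() is hypothetical, not code from this patch):

	static int read_kernel_long(long *dst, const void *src)
	{
		__get_kernel_nofault(dst, src, long, fault);
		return 0;
	fault:
		/* the SIGSEGV handler rewound us to the ___backtrack_faulted() site */
		return -EFAULT;
	}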
DEFINE(UM_KERN_PAGE_SIZE, PAGE_SIZE); DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK); DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT); -DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_GFP_KERNEL, GFP_KERNEL); DEFINE(UM_GFP_ATOMIC, GFP_ATOMIC); @@ -16,21 +14,3 @@ DEFINE(UM_THREAD_SIZE, THREAD_SIZE); DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); - -#ifdef CONFIG_PRINTK -DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); -#endif -#ifdef CONFIG_UML_X86 -DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); -#endif -#ifdef CONFIG_64BIT -DEFINE(UML_CONFIG_64BIT, CONFIG_64BIT); -#endif -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT -DEFINE(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT, CONFIG_UML_TIME_TRAVEL_SUPPORT); -#endif - -/* for stub */ -DEFINE(UML_STUB_FIELD_OFFSET, offsetof(struct stub_data, offset)); -DEFINE(UML_STUB_FIELD_CHILD_ERR, offsetof(struct stub_data, child_err)); -DEFINE(UML_STUB_FIELD_FD, offsetof(struct stub_data, fd)); diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index da0f6eea30d0..88835b52ae2b 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -15,7 +15,8 @@ enum um_irq_type { }; struct siginfo; -extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void sigio_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); extern void deactivate_fd(int fd, int irqnum); diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 789b83013f35..00ca3e12fd9a 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -13,7 +13,6 @@ struct siginfo; extern int uml_exitcode; -extern int ncpus; extern int kmalloc_ok; #define UML_ROUND_UP(addr) \ @@ -25,10 +24,12 @@ extern void free_stack(unsigned long stack, int order); struct pt_regs; extern void do_signal(struct pt_regs *regs); extern void interrupt_end(void); -extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); +extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc); extern unsigned long segv(struct faultinfo fi, unsigned long ip, - int is_user, struct uml_pt_regs *regs); + int is_user, struct uml_pt_regs *regs, + void *mc); extern int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out); @@ -41,6 +42,7 @@ extern void uml_pm_wake(void); extern int start_uml(void); extern void paging_init(void); +extern int parse_iomem(char *str, int *add); extern void uml_cleanup(void); extern void do_uml_exitcalls(void); @@ -59,11 +61,14 @@ extern unsigned long from_irq_stack(int nested); extern int singlestepping(void); -extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); -extern void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs); -extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); +extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); void um_idle_sleep(void); +void kasan_map_memory(void *start, size_t len); + #endif diff --git a/arch/um/include/shared/mem_user.h b/arch/um/include/shared/mem_user.h index 11a723a58545..d4727efcf23d 100644 --- a/arch/um/include/shared/mem_user.h +++ 
b/arch/um/include/shared/mem_user.h @@ -47,10 +47,8 @@ extern int iomem_size; #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1)) extern unsigned long find_iomem(char *driver, unsigned long *len_out); -extern void mem_total_pages(unsigned long physmem, unsigned long iomem, - unsigned long highmem); extern void setup_physmem(unsigned long start, unsigned long usable, - unsigned long len, unsigned long long highmem); + unsigned long len); extern void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index aff8906304ea..152a60080d5b 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -145,7 +145,6 @@ extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg); extern int os_get_ifname(int fd, char *namebuf); extern int os_set_slip(int fd); extern int os_mode_fd(int fd, int mode); -extern int os_fsync_file(int fd); extern int os_seek_file(int fd, unsigned long long offset); extern int os_open_file(const char *file, struct openflags flags, int mode); @@ -163,8 +162,10 @@ extern int os_set_fd_block(int fd, int blocking); extern int os_accept_connection(int fd); extern int os_create_unix_socket(const char *file, int len, int close_on_exec); extern int os_shutdown_socket(int fd, int r, int w); +extern int os_dup_file(int fd); extern void os_close_file(int fd); -extern int os_rcv_fd(int fd, int *helper_pid_out); +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len); extern int os_connect_socket(const char *name); extern int os_file_type(char *file); extern int os_file_mode(const char *file, struct openflags *mode_out); @@ -179,6 +180,8 @@ extern int os_eventfd(unsigned int initval, int flags); extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, unsigned int fds_num); int os_poll(unsigned int n, const int *fds); +void *os_mmap_rw_shared(int fd, size_t size); +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size); /* start_up.c */ extern void os_early_checks(void); @@ -191,16 +194,15 @@ extern void get_host_cpu_features( /* mem.c */ extern int create_mem_file(unsigned long long len); +/* tlb.c */ +extern void report_enomem(void); + /* process.c */ -extern unsigned long os_process_pc(int pid); -extern int os_process_parent(int pid); extern void os_alarm_process(int pid); -extern void os_stop_process(int pid); extern void os_kill_process(int pid, int reap_child); extern void os_kill_ptraced_process(int pid, int reap_child); extern int os_getpid(void); -extern int os_getpgrp(void); extern void init_new_thread_signals(void); @@ -211,7 +213,8 @@ extern int os_protect_memory(void *addr, unsigned long len, extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); -extern int os_mincore(void *addr, unsigned long len); + +void os_set_pdeathsig(void); /* execvp.c */ extern int execvp_noalloc(char *buf, const char *file, char *const argv[]); @@ -221,6 +224,11 @@ extern int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long *stack_out); extern int helper_wait(int pid); +struct os_helper_thread; +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg); +void os_kill_helper_thread(struct os_helper_thread *td); +void os_fix_helper_thread_signals(void); /* umid.c */ extern int umid_file_name(char *name, 
char *buf, int len); @@ -237,7 +245,6 @@ extern void block_signals(void); extern void unblock_signals(void); extern int um_set_signals(int enable); extern int um_set_signals_trace(int enable); -extern int os_is_signal_stack(void); extern void deliver_alarm(void); extern void register_pm_wake_signal(void); extern void block_signals_hard(void); @@ -268,25 +275,19 @@ extern long long os_persistent_clock_emulation(void); extern long long os_nsecs(void); /* skas/mem.c */ -extern long run_syscall_stub(struct mm_id * mm_idp, - int syscall, unsigned long *args, long expected, - void **addr, int done); -extern long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr); -extern int map(struct mm_id * mm_idp, unsigned long virt, - unsigned long len, int prot, int phys_fd, - unsigned long long offset, int done, void **data); -extern int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data); -extern int protect(struct mm_id * mm_idp, unsigned long addr, - unsigned long len, unsigned int prot, int done, void **data); +int syscall_stub_flush(struct mm_id *mm_idp); +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp); +void syscall_stub_dump_error(struct mm_id *mm_idp); + +int map(struct mm_id *mm_idp, unsigned long virt, + unsigned long len, int prot, int phys_fd, + unsigned long long offset); +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); extern int start_userspace(unsigned long stub_stack); -extern int copy_context_skas0(unsigned long stack, int pid); -extern void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs); +extern void userspace(struct uml_pt_regs *regs); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); extern int start_idle_thread(void *stack, jmp_buf *switch_buf); @@ -313,7 +314,7 @@ extern void um_irqs_resume(void); extern int add_sigio_fd(int fd); extern int ignore_sigio_fd(int fd); extern void maybe_sigio_broken(int fd); -extern void sigio_broken(int fd); +extern void sigio_broken(void); /* * unlocked versions for IRQ controller code. 
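The new skas/mem.c interface above replaces the old run_syscall_stub()/done-flag protocol: map() and unmap() now only queue struct stub_syscall entries in the stub's data page, and syscall_stub_flush() executes the queued batch in one switch into the userspace stub. Roughly, under those assumptions (remap_window() is a hypothetical caller with error handling trimmed, not code from this patch):

	static int remap_window(struct mm_id *id, unsigned long virt,
				unsigned long len, int prot, int fd,
				unsigned long long off)
	{
		int err;

		/* both calls only queue a struct stub_syscall; the host is untouched */
		err = unmap(id, virt, len);
		if (!err)
			err = map(id, virt, len, prot, fd, off);
		if (err)
			return err;

		/* one switch into the stub executes everything queued so far */
		return syscall_stub_flush(id);
	}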
* @@ -326,9 +327,6 @@ extern int __ignore_sigio_fd(int fd); /* tty.c */ extern int get_pty(void); -/* sys-$ARCH/task_size.c */ -extern unsigned long os_get_top_address(void); - long syscall(long number, ...); /* irqflags tracing */ diff --git a/arch/um/include/shared/registers.h b/arch/um/include/shared/registers.h index a0450326521c..7d81b2339a48 100644 --- a/arch/um/include/shared/registers.h +++ b/arch/um/include/shared/registers.h @@ -8,12 +8,6 @@ #include <sysdep/ptrace.h> -extern int save_i387_registers(int pid, unsigned long *fp_regs); -extern int restore_i387_registers(int pid, unsigned long *fp_regs); -extern int save_fp_registers(int pid, unsigned long *fp_regs); -extern int restore_fp_registers(int pid, unsigned long *fp_regs); -extern int save_fpx_registers(int pid, unsigned long *fp_regs); -extern int restore_fpx_registers(int pid, unsigned long *fp_regs); extern int init_pid_registers(int pid); extern void get_safe_registers(unsigned long *regs, unsigned long *fp_regs); extern int get_fp_registers(int pid, unsigned long *regs); diff --git a/arch/um/include/shared/sigio.h b/arch/um/include/shared/sigio.h index e60c8b227844..c6c2edce1f6d 100644 --- a/arch/um/include/shared/sigio.h +++ b/arch/um/include/shared/sigio.h @@ -6,7 +6,6 @@ #ifndef __SIGIO_H__ #define __SIGIO_H__ -extern int write_sigio_irq(int fd); extern void sigio_lock(void); extern void sigio_unlock(void); diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index e82e203f5f41..140388c282f6 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -7,12 +7,11 @@ #define __MM_ID_H struct mm_id { - union { - int mm_fd; - int pid; - } u; + int pid; unsigned long stack; - int kill; + int syscall_data_len; }; +void __switch_mm(struct mm_id *mm_idp); + #endif diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index c93d2cbc8f32..85c50122ab98 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -10,10 +10,10 @@ extern int userspace_pid[]; -extern int user_thread(unsigned long stack, int flags); extern void new_thread_handler(void); extern void handle_syscall(struct uml_pt_regs *regs); -extern long execute_syscall_skas(void *r); extern unsigned long current_stub_stack(void); +extern struct mm_id *current_mm_id(void); +extern void current_mm_sync(void); #endif diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 5e3ade3fb38b..81a4cace032c 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -8,10 +8,52 @@ #ifndef __STUB_DATA_H #define __STUB_DATA_H +#include <linux/compiler_types.h> +#include <as-layout.h> +#include <sysdep/tls.h> + +struct stub_init_data { + unsigned long stub_start; + + int stub_code_fd; + unsigned long stub_code_offset; + int stub_data_fd; + unsigned long stub_data_offset; + + unsigned long segv_handler; +}; + +#define STUB_NEXT_SYSCALL(s) \ + ((struct stub_syscall *) (((unsigned long) s) + (s)->cmd_len)) + +enum stub_syscall_type { + STUB_SYSCALL_UNSET = 0, + STUB_SYSCALL_MMAP, + STUB_SYSCALL_MUNMAP, +}; + +struct stub_syscall { + struct { + unsigned long addr; + unsigned long length; + unsigned long offset; + int fd; + int prot; + } mem; + + enum stub_syscall_type syscall; +}; + struct stub_data { unsigned long offset; - int fd; - long parent_err, child_err; + long err, child_err; + + int syscall_data_len; + /* 128 leaves enough room for additional fields in the 
struct */ + struct stub_syscall syscall_data[(UM_KERN_PAGE_SIZE - 128) / sizeof(struct stub_syscall)] __aligned(16); + + /* Stack for our signal handlers and for calling into . */ + unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE); }; #endif diff --git a/arch/um/include/shared/timetravel.h b/arch/um/include/shared/timetravel.h index e5c3d69f1b69..7c2b277b7eb0 100644 --- a/arch/um/include/shared/timetravel.h +++ b/arch/um/include/shared/timetravel.h @@ -12,11 +12,19 @@ enum time_travel_mode { TT_MODE_EXTERNAL, }; -#if defined(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT) || \ - defined(CONFIG_UML_TIME_TRAVEL_SUPPORT) +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) extern enum time_travel_mode time_travel_mode; +extern int time_travel_should_print_bc_msg; #else #define time_travel_mode TT_MODE_OFF -#endif /* (UML_)CONFIG_UML_TIME_TRAVEL_SUPPORT */ +#define time_travel_should_print_bc_msg 0 +#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ + +void _time_travel_print_bc_msg(void); +static inline void time_travel_print_bc_msg(void) +{ + if (time_travel_should_print_bc_msg) + _time_travel_print_bc_msg(); +} #endif /* _UM_TIME_TRAVEL_H_ */ diff --git a/arch/um/include/shared/um_malloc.h b/arch/um/include/shared/um_malloc.h index 13da93284c2c..815dd03e8707 100644 --- a/arch/um/include/shared/um_malloc.h +++ b/arch/um/include/shared/um_malloc.h @@ -11,8 +11,9 @@ extern void *uml_kmalloc(int size, int flags); extern void kfree(const void *ptr); -extern void *vmalloc(unsigned long size); -extern void vfree(void *ptr); +extern void *vmalloc_noprof(unsigned long size); +#define vmalloc(...) vmalloc_noprof(__VA_ARGS__) +extern void vfree(const void *ptr); #endif /* __UM_MALLOC_H__ */ diff --git a/arch/um/include/shared/user.h b/arch/um/include/shared/user.h index 326e52450e41..139eb78a4767 100644 --- a/arch/um/include/shared/user.h +++ b/arch/um/include/shared/user.h @@ -38,15 +38,23 @@ extern void panic(const char *fmt, ...) #define UM_KERN_DEBUG KERN_DEBUG #define UM_KERN_CONT KERN_CONT -#ifdef UML_CONFIG_PRINTK +#if IS_ENABLED(CONFIG_PRINTK) #define printk(...) _printk(__VA_ARGS__) extern int _printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +extern void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, _Bool ascii); #else static inline int printk(const char *fmt, ...) 
{ return 0; } +static inline void print_hex_dump(const char *level, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, _Bool ascii) +{ +} #endif extern int in_aton(char *str); diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 811188be954c..4df1cd0d2017 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -17,7 +17,7 @@ extra-y := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o @@ -47,7 +47,7 @@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE $(call if_changed,quote2) quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ + cmd_mkcapflags = $(CONFIG_SHELL) $(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ cpufeature = $(src)/../../x86/include/asm/cpufeatures.h vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c index 4954188a6a09..15c342426489 100644 --- a/arch/um/kernel/dtb.c +++ b/arch/um/kernel/dtb.c @@ -17,7 +17,7 @@ void uml_dtb_init(void) area = uml_load_file(dtb, &size); if (area) { - if (!early_init_dt_scan(area)) { + if (!early_init_dt_scan(area, __pa(area))) { pr_err("invalid DTB %s\n", dtb); memblock_free(area, size); return; @@ -31,6 +31,7 @@ void uml_dtb_init(void) static int __init uml_dtb_setup(char *line, int *add) { + *add = 0; dtb = line; return 0; } diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 3385d653ebd0..a36b7918a011 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -116,8 +116,6 @@ SECTIONS .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . 
= ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) @@ -178,3 +176,6 @@ SECTIONS DISCARDS } + +ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE, + "STUB code must not be larger than one page"); diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 827a0d3fa589..cb8b5cd9285c 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -22,17 +22,8 @@ void flush_thread(void) { - void *data = NULL; - int ret; - arch_flush_thread(¤t->thread.arch); - ret = unmap(¤t->mm->context.id, 0, TASK_SIZE, 1, &data); - if (ret) { - printk(KERN_ERR "%s - clearing address space failed, err = %d\n", - __func__, ret); - force_sig(SIGKILL); - } get_safe_registers(current_pt_regs()->regs.gp, current_pt_regs()->regs.fp); @@ -44,8 +35,5 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; clear_thread_flag(TIF_SINGLESTEP); -#ifdef SUBARCH_EXECVE1 - SUBARCH_EXECVE1(regs->regs); -#endif } EXPORT_SYMBOL(start_thread); diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 47b8cb1a1156..99dba827461c 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -34,6 +34,7 @@ int __init read_initrd(void) static int __init uml_initrd_setup(char *line, int *add) { + *add = 0; initrd = line; return 0; } diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 635d44606bfe..abe8f30a521c 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -37,7 +37,7 @@ struct irq_reg { bool pending; bool wakeup; #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT - bool pending_on_resume; + bool pending_event; void (*timetravel_handler)(int, int, void *, struct time_travel_event *); struct time_travel_event event; @@ -52,10 +52,13 @@ struct irq_entry { bool sigio_workaround; }; -static DEFINE_SPINLOCK(irq_lock); +static DEFINE_RAW_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static bool irqs_pending; +#endif static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) { @@ -84,9 +87,12 @@ static void irq_event_handler(struct time_travel_event *ev) { struct irq_reg *reg = container_of(ev, struct irq_reg, event); - /* do nothing if suspended - just to cause a wakeup */ - if (irqs_suspended) + /* do nothing if suspended; just cause a wakeup and mark as pending */ + if (irqs_suspended) { + irqs_pending = true; + reg->pending_event = true; return; + } generic_handle_irq(reg->irq); } @@ -110,16 +116,47 @@ static bool irq_do_timetravel_handler(struct irq_entry *entry, if (!reg->event.pending) return false; - if (irqs_suspended) - reg->pending_on_resume = true; return true; } + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ + struct irq_entry *entry; + + if (!irqs_pending || timetravel_handlers_only) + return; + + irqs_pending = false; + + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + struct irq_reg *reg = &entry->reg[t]; + + /* + * Any timetravel_handler was invoked already, just + * directly run the IRQ. 
+ */ + if (reg->pending_event) { + irq_enter(); + generic_handle_irq(reg->irq); + irq_exit(); + reg->pending_event = false; + } + } + } +} #else static bool irq_do_timetravel_handler(struct irq_entry *entry, enum um_irq_type t) { return false; } + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ +} #endif static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, @@ -145,6 +182,8 @@ static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type */ if (timetravel_handlers_only) { #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + reg->pending_event = true; + irqs_pending = true; mark_sigio_pending(); #endif return; } @@ -162,6 +201,10 @@ static void _sigio_handler(struct uml_pt_regs *regs, if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) return; + /* Flush out pending events that were ignored due to time-travel. */ + if (!irqs_suspended) + irq_do_pending_events(timetravel_handlers_only); + while (1) { /* This is now lockless - epoll keeps back-references to the irqs * which have triggered it so there is no need to walk the irq @@ -193,9 +236,12 @@ static void _sigio_handler(struct uml_pt_regs *regs, free_irqs(); } -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { + preempt_disable(); _sigio_handler(regs, irqs_suspended); + preempt_enable(); } static struct irq_entry *get_irq_entry_by_fd(int fd) @@ -212,7 +258,7 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) return NULL; } -static void free_irq_entry(struct irq_entry *to_free, bool remove) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { if (!to_free) return; @@ -220,7 +266,6 @@ static void free_irq_entry(struct irq_entry *to_free, bool remove) if (remove) os_del_epoll_fd(to_free->fd); list_del(&to_free->list); - kfree(to_free); } static bool update_irq_entry(struct irq_entry *entry) @@ -241,17 +286,19 @@ static bool update_irq_entry(struct irq_entry *entry) return false; } -static void update_or_free_irq_entry(struct irq_entry *entry) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - if (!update_irq_entry(entry)) - free_irq_entry(entry, false); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, void (*timetravel_handler)(int, int, void *, struct time_travel_event *)) { - struct irq_entry *irq_entry; + struct irq_entry *irq_entry, *to_free = NULL; int err, events = os_event_mask(type); unsigned long flags; @@ -259,9 +306,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, if (err < 0) goto out; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); irq_entry = get_irq_entry_by_fd(fd); if (irq_entry) { +already: /* cannot register the same FD twice with the same type */ if (WARN_ON(irq_entry->reg[type].events)) { err = -EALREADY; goto out_unlock; @@ -271,11 +319,22 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, /* temporarily disable to avoid IRQ-side locking */ os_del_epoll_fd(fd); } else { - irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); - if (!irq_entry) { - err = -ENOMEM; - goto out_unlock; + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM;
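The allocation dance in activate_fd() above is worth spelling out: with irq_lock now a raw spinlock (presumably for PREEMPT_RT correctness, where even GFP_ATOMIC allocations may take sleeping locks), kzalloc() cannot run while the lock is held. So the lock is dropped with interrupts kept disabled, the entry is allocated, the lock is re-taken, and the lookup is repeated because the list may have changed in the gap. The pattern reduced to its skeleton (struct obj, lookup(), insert() and lock are hypothetical stand-ins; only the locking choreography mirrors the real code):

	static int get_or_create(int key)
	{
		struct obj *obj, *new, *to_free = NULL;
		unsigned long flags;

		raw_spin_lock_irqsave(&lock, flags);
		obj = lookup(key);
		if (!obj) {
			/* drop the raw lock for the allocation, keep IRQs off */
			raw_spin_unlock(&lock);
			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new) {
				local_irq_restore(flags);
				return -ENOMEM;
			}
			raw_spin_lock(&lock);
			obj = lookup(key);	/* re-check after the gap */
			if (obj)
				to_free = new;	/* lost the race, discard ours */
			else
				insert(obj = new);
		}
		/* ... update obj under the lock ... */
		raw_spin_unlock_irqrestore(&lock, flags);
		kfree(to_free);			/* kfree(NULL) is a no-op */
		return 0;
	}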
+ } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; } + irq_entry = new; irq_entry->fd = fd; list_add_tail(&irq_entry->list, &active_fds); maybe_sigio_broken(fd); @@ -294,12 +353,11 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, #endif WARN_ON(!update_irq_entry(irq_entry)); - spin_unlock_irqrestore(&irq_lock, flags); - - return 0; + err = 0; out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); out: + kfree(to_free); return err; } @@ -313,19 +371,20 @@ void free_irq_by_fd(int fd) struct irq_entry *to_free; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - free_irq_entry(to_free, true); - spin_unlock_irqrestore(&irq_lock, flags); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *entry; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type i; @@ -341,12 +400,13 @@ static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) os_del_epoll_fd(entry->fd); reg->events = 0; - update_or_free_irq_entry(entry); + to_free = update_or_remove_irq_entry(entry); goto out; } } out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) @@ -357,7 +417,7 @@ void deactivate_fd(int fd, int irqnum) os_del_epoll_fd(fd); - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); entry = get_irq_entry_by_fd(fd); if (!entry) goto out; @@ -369,9 +429,10 @@ void deactivate_fd(int fd, int irqnum) entry->reg[i].events = 0; } - update_or_free_irq_entry(entry); + entry = update_or_remove_irq_entry(entry); out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -501,7 +562,7 @@ void um_irqs_suspend(void) irqs_suspended = true; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; bool clear = true; @@ -534,7 +595,7 @@ void um_irqs_suspend(void) !__ignore_sigio_fd(entry->fd); } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); } void um_irqs_resume(void) @@ -543,30 +604,7 @@ void um_irqs_resume(void) unsigned long flags; - local_irq_save(flags); -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT - /* - * We don't need to lock anything here since we're in resume - * and nothing else is running, but have disabled IRQs so we - * don't try anything else with the interrupt list from there. 
- */ - list_for_each_entry(entry, &active_fds, list) { - enum um_irq_type t; - - for (t = 0; t < NUM_IRQ_TYPES; t++) { - struct irq_reg *reg = &entry->reg[t]; - - if (reg->pending_on_resume) { - irq_enter(); - generic_handle_irq(reg->irq); - irq_exit(); - reg->pending_on_resume = false; - } - } - } -#endif - - spin_lock(&irq_lock); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); @@ -580,7 +618,7 @@ void um_irqs_resume(void) } } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); irqs_suspended = false; send_sigio_to_self(); @@ -591,7 +629,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) struct irq_entry *entry; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; @@ -606,7 +644,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) } } unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); return 0; } #else @@ -652,115 +690,3 @@ void __init init_IRQ(void) /* Initialize EPOLL Loop */ os_setup_epoll(); } - -/* - * IRQ stack entry and exit: - * - * Unlike i386, UML doesn't receive IRQs on the normal kernel stack - * and switch over to the IRQ stack after some preparation. We use - * sigaltstack to receive signals on a separate stack from the start. - * These two functions make sure the rest of the kernel won't be too - * upset by being on a different stack. The IRQ stack has a - * thread_info structure at the bottom so that current et al continue - * to work. - * - * to_irq_stack copies the current task's thread_info to the IRQ stack - * thread_info and sets the tasks's stack to point to the IRQ stack. - * - * from_irq_stack copies the thread_info struct back (flags may have - * been modified) and resets the task's stack pointer. - * - * Tricky bits - - * - * What happens when two signals race each other? UML doesn't block - * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal - * could arrive while a previous one is still setting up the - * thread_info. - * - * There are three cases - - * The first interrupt on the stack - sets up the thread_info and - * handles the interrupt - * A nested interrupt interrupting the copying of the thread_info - - * can't handle the interrupt, as the stack is in an unknown state - * A nested interrupt not interrupting the copying of the - * thread_info - doesn't do any setup, just handles the interrupt - * - * The first job is to figure out whether we interrupted stack setup. - * This is done by xchging the signal mask with thread_info->pending. - * If the value that comes back is zero, then there is no setup in - * progress, and the interrupt can be handled. If the value is - * non-zero, then there is stack setup in progress. In order to have - * the interrupt handled, we leave our signal in the mask, and it will - * be handled by the upper handler after it has set up the stack. - * - * Next is to figure out whether we are the outer handler or a nested - * one. As part of setting up the stack, thread_info->real_thread is - * set to non-NULL (and is reset to NULL on exit). This is the - * nesting indicator. If it is non-NULL, then the stack is already - * set up and the handler can run. 
- */ - -static unsigned long pending_mask; - -unsigned long to_irq_stack(unsigned long *mask_out) -{ - struct thread_info *ti; - unsigned long mask, old; - int nested; - - mask = xchg(&pending_mask, *mask_out); - if (mask != 0) { - /* - * If any interrupts come in at this point, we want to - * make sure that their bits aren't lost by our - * putting our bit in. So, this loop accumulates bits - * until xchg returns the same value that we put in. - * When that happens, there were no new interrupts, - * and pending_mask contains a bit for each interrupt - * that came in. - */ - old = *mask_out; - do { - old |= mask; - mask = xchg(&pending_mask, old); - } while (mask != old); - return 1; - } - - ti = current_thread_info(); - nested = (ti->real_thread != NULL); - if (!nested) { - struct task_struct *task; - struct thread_info *tti; - - task = cpu_tasks[ti->cpu].task; - tti = task_thread_info(task); - - *ti = *tti; - ti->real_thread = tti; - task->stack = ti; - } - - mask = xchg(&pending_mask, 0); - *mask_out |= mask | nested; - return 0; -} - -unsigned long from_irq_stack(int nested) -{ - struct thread_info *ti, *to; - unsigned long mask; - - ti = current_thread_info(); - - pending_mask = 1; - - to = ti->real_thread; - current->stack = to; - ti->real_thread = NULL; - *to = *ti; - - mask = xchg(&pending_mask, 0); - return mask & ~1; -} - diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c index 427dd5a61a38..419021175272 100644 --- a/arch/um/kernel/kmsg_dump.c +++ b/arch/um/kernel/kmsg_dump.c @@ -8,7 +8,7 @@ #include <os.h> static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + struct kmsg_dump_detail *detail) { static struct kmsg_dump_iter iter; static DEFINE_SPINLOCK(lock); @@ -57,7 +57,7 @@ static struct kmsg_dumper kmsg_dumper = { .dump = kmsg_dumper_stdout }; -int __init kmsg_dumper_stdout_init(void) +static int __init kmsg_dumper_stdout_init(void) { return kmsg_dump_register(&kmsg_dumper); } diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 3a85bde3e173..f2fb77da08cf 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(os_shutdown_socket); EXPORT_SYMBOL(os_create_unix_socket); EXPORT_SYMBOL(os_connect_socket); EXPORT_SYMBOL(os_accept_connection); -EXPORT_SYMBOL(os_rcv_fd); +EXPORT_SYMBOL(os_rcv_fd_msg); EXPORT_SYMBOL(run_helper); EXPORT_SYMBOL(os_major); EXPORT_SYMBOL(os_minor); diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c index 5cecd0e291fb..cb9d178ab7d8 100644 --- a/arch/um/kernel/load_file.c +++ b/arch/um/kernel/load_file.c @@ -48,9 +48,7 @@ void *uml_load_file(const char *filename, unsigned long long *size) return NULL; } - area = memblock_alloc(*size, SMP_CACHE_BYTES); - if (!area) - panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); if (__uml_load_file(filename, area, *size)) { memblock_free(area, *size); diff --git a/arch/um/kernel/maccess.c b/arch/um/kernel/maccess.c deleted file mode 100644 index 8ccd56813f68..000000000000 --- a/arch/um/kernel/maccess.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> - */ - -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <os.h> - -bool copy_from_kernel_nofault_allowed(const void *src, size_t size) -{ - void *psrc = (void *)rounddown((unsigned long)src, PAGE_SIZE); - - if ((unsigned long)src < PAGE_SIZE || size <= 0) - return false; - if 
(os_mincore(psrc, size + src - psrc) <= 0) - return false; - return true; -} diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 38d5a71a579b..76bec7de81b5 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -6,18 +6,20 @@ #include <linux/stddef.h> #include <linux/module.h> #include <linux/memblock.h> -#include <linux/highmem.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> -#include <asm/fixmap.h> +#include <linux/init.h> +#include <asm/sections.h> #include <asm/page.h> +#include <asm/pgalloc.h> #include <as-layout.h> #include <init.h> #include <kern.h> #include <kern_util.h> #include <mem_user.h> #include <os.h> +#include <um_malloc.h> #include <linux/sched/task.h> #ifdef CONFIG_KASAN @@ -49,14 +51,12 @@ EXPORT_SYMBOL(empty_zero_page); pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* Initialized at boot time, and readonly after that */ -unsigned long long highmem; -EXPORT_SYMBOL(highmem); int kmalloc_ok = 0; /* Used during early boot */ static unsigned long brk_end; -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -68,14 +68,16 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; - - /* this will put all low memory onto the freelists */ - memblock_free_all(); - max_low_pfn = totalram_pages(); + min_low_pfn = PFN_UP(__pa(uml_reserved)); max_pfn = max_low_pfn; +} + +void __init mem_init(void) +{ kmalloc_ok = 1; } +#if IS_ENABLED(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) /* * Create a page table and place a pointer to it in a middle page * directory entry. @@ -97,7 +99,7 @@ static void __init one_page_table_init(pmd_t *pmd) static void __init one_md_table_init(pud_t *pud) { -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 pmd_t *pmd_table = (pmd_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!pmd_table) panic("%s: Failed to allocate %lu bytes align=%lx\n", @@ -108,6 +110,19 @@ static void __init one_md_table_init(pud_t *pud) #endif } +static void __init one_ud_table_init(p4d_t *p4d) +{ +#if CONFIG_PGTABLE_LEVELS > 3 + pud_t *pud_table = (pud_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!pud_table) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + set_p4d(p4d, __p4d(_KERNPG_TABLE + (unsigned long) __pa(pud_table))); + BUG_ON(pud_table != pud_offset(p4d, 0)); +#endif +} + static void __init fixrange_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -125,6 +140,8 @@ static void __init fixrange_init(unsigned long start, unsigned long end, for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) + one_ud_table_init(p4d); pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) one_md_table_init(pud); @@ -139,7 +156,6 @@ static void __init fixrange_init(unsigned long start, unsigned long end, static void __init fixaddr_user_init( void) { -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA long size = FIXADDR_USER_END - FIXADDR_USER_START; pte_t *pte; phys_t p; @@ -161,13 +177,12 @@ static void __init fixaddr_user_init( void) pte = virt_to_kpte(vaddr); pte_set_val(*pte, p, PAGE_READONLY); } -#endif } +#endif void __init paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - unsigned long vaddr; empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -178,14 +193,9 @@ void __init 
paging_init(void) max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; free_area_init(max_zone_pfn); - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); - +#if IS_ENABLED(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) fixaddr_user_init(); +#endif } /* @@ -201,14 +211,13 @@ void free_initmem(void) pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } + return pgd; } @@ -236,3 +245,11 @@ static const pgprot_t protection_map[16] = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED }; DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 91485119ae67..af02b5f9911d 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -12,6 +12,7 @@ #include <as-layout.h> #include <init.h> #include <kern.h> +#include <kern_util.h> #include <mem_user.h> #include <os.h> @@ -21,23 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; EXPORT_SYMBOL(high_physmem); -extern unsigned long long physmem_size; - -void __init mem_total_pages(unsigned long physmem, unsigned long iomem, - unsigned long highmem) - { - unsigned long phys_pages, highmem_pages; - unsigned long iomem_pages, total_pages; - - phys_pages = physmem >> PAGE_SHIFT; - iomem_pages = iomem >> PAGE_SHIFT; - highmem_pages = highmem >> PAGE_SHIFT; - - total_pages = phys_pages + iomem_pages + highmem_pages; - - max_mapnr = total_pages; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { @@ -63,13 +47,12 @@ * @reserve_end: end address of the physical kernel memory. * @len: Length of total physical memory that should be mapped/made * available, in bytes. - * @highmem: Number of highmem bytes that should be mapped/made available. * - * Creates an unlinked temporary file of size (len + highmem) and memory maps + * Creates an unlinked temporary file of size (len) and memory maps * it on the last executable image address (uml_reserved). * * The offset is needed as the length of the total physical memory - * (len + highmem) includes the size of the memory used by the executable image, + * (len) includes the size of the memory used by the executable image, * but the mapped-to address is the last address of the executable image * (uml_reserved == end address of executable image). * @@ -77,24 +60,24 @@ of all user space processes/kernel tasks. */ void __init setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long long highmem) + unsigned long len) { unsigned long reserve = reserve_end - start; - long map_size = len - reserve; + unsigned long map_size = len - reserve; int err; - if(map_size <= 0) { + if (len <= reserve) { os_warn("Too little physical memory!
Needed=%lu, given=%lu\n", reserve, len); exit(1); } - physmem_fd = create_mem_file(len + highmem); + physmem_fd = create_mem_file(len); err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); if (err < 0) { - os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " + os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p " "failed - errno = %d\n", map_size, (void *) reserve_end, err); exit(1); @@ -106,9 +89,8 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, */ os_seek_file(physmem_fd, __pa(__syscall_stub_start)); os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); - os_fsync_file(physmem_fd); - memblock_add(__pa(start), len + highmem); + memblock_add(__pa(start), len); memblock_reserve(__pa(start), reserve); min_low_pfn = PFN_UP(__pa(reserve_end)); @@ -136,10 +118,6 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out) region = region->next; } } - else if (phys < __pa(end_iomem) + highmem) { - fd = physmem_fd; - *offset_out = phys - iomem_size; - } return fd; } @@ -148,6 +126,8 @@ EXPORT_SYMBOL(phys_mapping); static int __init uml_mem_setup(char *line, int *add) { char *retptr; + + *add = 0; physmem_size = memparse(line,&retptr); return 0; } @@ -161,8 +141,6 @@ __uml_setup("mem=", uml_mem_setup, " Example: mem=64M\n\n" ); -extern int __init parse_iomem(char *str, int *add); - __uml_setup("iomem=", parse_iomem, "iomem=<name>,<file>\n" " Configure <file> as an IO memory region named <name>.\n\n" diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index ab95648e93e1..0cd6fad3d908 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -15,6 +15,7 @@ #include <linux/proc_fs.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/debug.h> @@ -26,6 +27,8 @@ #include <linux/resume_user_mode.h> #include <asm/current.h> #include <asm/mmu_context.h> +#include <asm/switch_to.h> +#include <asm/exec.h> #include <linux/uaccess.h> #include <as-layout.h> #include <kern_util.h> @@ -40,24 +43,8 @@ * cares about its entry, so it's OK if another processor is modifying its * entry. */ -struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... 
NR_CPUS - 1] = { -1, NULL } };
-
-static inline int external_pid(void)
-{
-	/* FIXME: Need to look up userspace_pid by cpu */
-	return userspace_pid[0];
-}
-
-int pid_to_processor_id(int pid)
-{
-	int i;
-
-	for (i = 0; i < ncpus; i++) {
-		if (cpu_tasks[i].pid == pid)
-			return i;
-	}
-	return -1;
-}
+struct task_struct *cpu_tasks[NR_CPUS];
+EXPORT_SYMBOL(cpu_tasks);
 
 void free_stack(unsigned long stack, int order)
 {
@@ -78,13 +65,10 @@ unsigned long alloc_stack(int order, int atomic)
 
 static inline void set_current(struct task_struct *task)
 {
-	cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task)
-		{ external_pid(), task });
+	cpu_tasks[task_thread_info(task)->cpu] = task;
 }
 
-extern void arch_switch_to(struct task_struct *to);
-
-void *__switch_to(struct task_struct *from, struct task_struct *to)
+struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to)
 {
 	to->thread.prev_sched = from;
 	set_current(to);
@@ -119,28 +103,26 @@ int get_current_pid(void)
  */
 void new_thread_handler(void)
 {
-	int (*fn)(void *), n;
+	int (*fn)(void *);
 	void *arg;
 
 	if (current->thread.prev_sched != NULL)
 		schedule_tail(current->thread.prev_sched);
 	current->thread.prev_sched = NULL;
 
-	fn = current->thread.request.u.thread.proc;
-	arg = current->thread.request.u.thread.arg;
+	fn = current->thread.request.thread.proc;
+	arg = current->thread.request.thread.arg;
 
 	/*
 	 * callback returns only if the kernel thread execs a process
 	 */
-	n = fn(arg);
-	userspace(&current->thread.regs.regs, current_thread_info()->aux_fp_regs);
+	fn(arg);
+	userspace(&current->thread.regs.regs);
 }
 
 /* Called magically, see new_thread_handler above */
-void fork_handler(void)
+static void fork_handler(void)
 {
-	force_flush_all();
-
 	schedule_tail(current->thread.prev_sched);
 
 	/*
@@ -152,7 +134,7 @@ void fork_handler(void)
 
 	current->thread.prev_sched = NULL;
 
-	userspace(&current->thread.regs.regs, current_thread_info()->aux_fp_regs);
+	userspace(&current->thread.regs.regs);
 }
 
 int copy_thread(struct task_struct * p, const struct kernel_clone_args *args)
@@ -177,8 +159,8 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args)
 		arch_copy_thread(&current->thread.arch, &p->thread.arch);
 	} else {
 		get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp);
-		p->thread.request.u.thread.proc = args->fn;
-		p->thread.request.u.thread.arg = args->fn_arg;
+		p->thread.request.thread.proc = args->fn;
+		p->thread.request.thread.arg = args->fn_arg;
 		handler = new_thread_handler;
 	}
 
@@ -206,6 +188,21 @@ void initial_thread_cb(void (*proc)(void *), void *arg)
 	kmalloc_ok = save_kmalloc_ok;
 }
 
+int arch_dup_task_struct(struct task_struct *dst,
+			 struct task_struct *src)
+{
+	/* init_task is not dynamically sized (missing FPU state) */
+	if (unlikely(src == &init_task)) {
+		memcpy(dst, src, sizeof(init_task));
+		memset((void *)dst + sizeof(init_task), 0,
+		       arch_task_struct_size - sizeof(init_task));
+	} else {
+		memcpy(dst, src, arch_task_struct_size);
+	}
+
+	return 0;
+}
+
 void um_idle_sleep(void)
 {
 	if (time_travel_mode != TT_MODE_OFF)
@@ -216,7 +213,6 @@ void um_idle_sleep(void)
 
 void arch_cpu_idle(void)
 {
-	cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
 	um_idle_sleep();
 }
 
@@ -225,14 +221,6 @@ int __uml_cant_sleep(void) {
 	/* Is in_interrupt() really needed? 
*/ } -int user_context(unsigned long sp) -{ - unsigned long stack; - - stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); - return stack != (unsigned long) current_thread_info(); -} - extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; void do_uml_exitcalls(void) @@ -250,88 +238,11 @@ char *uml_strdup(const char *string) } EXPORT_SYMBOL(uml_strdup); -int copy_to_user_proc(void __user *to, void *from, int size) -{ - return copy_to_user(to, from, size); -} - int copy_from_user_proc(void *to, void __user *from, int size) { return copy_from_user(to, from, size); } -int clear_user_proc(void __user *buf, int size) -{ - return clear_user(buf, size); -} - -static atomic_t using_sysemu = ATOMIC_INIT(0); -int sysemu_supported; - -void set_using_sysemu(int value) -{ - if (value > sysemu_supported) - return; - atomic_set(&using_sysemu, value); -} - -int get_using_sysemu(void) -{ - return atomic_read(&using_sysemu); -} - -static int sysemu_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", get_using_sysemu()); - return 0; -} - -static int sysemu_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, sysemu_proc_show, NULL); -} - -static ssize_t sysemu_proc_write(struct file *file, const char __user *buf, - size_t count, loff_t *pos) -{ - char tmp[2]; - - if (copy_from_user(tmp, buf, 1)) - return -EFAULT; - - if (tmp[0] >= '0' && tmp[0] <= '2') - set_using_sysemu(tmp[0] - '0'); - /* We use the first char, but pretend to write everything */ - return count; -} - -static const struct proc_ops sysemu_proc_ops = { - .proc_open = sysemu_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, - .proc_write = sysemu_proc_write, -}; - -int __init make_proc_sysemu(void) -{ - struct proc_dir_entry *ent; - if (!sysemu_supported) - return 0; - - ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_ops); - - if (ent == NULL) - { - printk(KERN_WARNING "Failed to register /proc/sysemu\n"); - return 0; - } - - return 0; -} - -late_initcall(make_proc_sysemu); - int singlestepping(void) { return test_thread_flag(TIF_SINGLESTEP); @@ -384,11 +295,3 @@ unsigned long __get_wchan(struct task_struct *p) return 0; } - -int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) -{ - int cpu = current_thread_info()->cpu; - - return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu); -} - diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 6600a2782796..2124624b7817 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -35,9 +35,6 @@ void ptrace_disable(struct task_struct *child) user_disable_single_step(child); } -extern int peek_user(struct task_struct * child, long addr, long data); -extern int poke_user(struct task_struct * child, long addr, long data); - long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 48c0610d506e..680bce4bd8fa 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -9,6 +9,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/oom.h> +#include <linux/reboot.h> #include <kern_util.h> #include <os.h> #include <skas.h> @@ -28,7 +29,7 @@ static void kill_off_processes(void) t = find_lock_task_mm(p); if (!t) continue; - pid = t->mm->context.id.u.pid; + pid = t->mm->context.id.pid; task_unlock(t); os_kill_ptraced_process(pid, 1); } @@ -58,3 +59,18 @@ void machine_halt(void) { machine_power_off(); } + +static int 
sys_power_off_handler(struct sys_off_data *data) +{ + machine_power_off(); + return 0; +} + +static int register_power_off(void) +{ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + sys_power_off_handler, NULL); + return 0; +} +__initcall(register_power_off); diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index 5085a50c3b8c..4fc04742048a 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -8,32 +8,6 @@ #include <os.h> #include <sigio.h> -/* Protected by sigio_lock() called from write_sigio_workaround */ -static int sigio_irq_fd = -1; - -static irqreturn_t sigio_interrupt(int irq, void *data) -{ - char c; - - os_read_file(sigio_irq_fd, &c, sizeof(c)); - return IRQ_HANDLED; -} - -int write_sigio_irq(int fd) -{ - int err; - - err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, - 0, "write sigio", NULL); - if (err < 0) { - printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " - "err = %d\n", err); - return -1; - } - sigio_irq_fd = fd; - return 0; -} - /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ static DEFINE_MUTEX(sigio_mutex); diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore new file mode 100644 index 000000000000..c3409ced0f38 --- /dev/null +++ b/arch/um/kernel/skas/.gitignore @@ -0,0 +1,2 @@ +stub_exe +stub_exe.dbg diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index f93972a25765..3384be42691f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -3,15 +3,48 @@ # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # -obj-y := clone.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ + stub_exe_embed.o -# clone.o is in the stub, so it can't be built with profiling +# Stub executable + +stub_exe_objs-y := stub_exe.o + +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) + +# Object file containing the ELF executable +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe + +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE + $(call if_changed,stub_exe) + +$(obj)/stub_exe: OBJCOPYFLAGS := -S +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE + $(call if_changed,objcopy) + +quiet_cmd_stub_exe = STUB_EXE $@ + cmd_stub_exe = $(CC) -nostdlib -o $@ \ + $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \ + $(filter %.o,$^) + +STUB_EXE_LDFLAGS = -Wl,-n -static + +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) + +# end + +# stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it -CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := clone.o +CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) + +# Clang will call memset() from __builtin_alloca() when stack variable +# initialization is enabled, which is used in stub_exe.c. 
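+#
+# Illustration only (an assumption, not part of this change): under
+# -ftrivial-auto-var-init=zero/pattern, a stack object in stub_exe.c such as
+#
+#   struct stub_init_data init_data;
+#
+# may be initialized by Clang through an out-of-line memset() call, which a
+# -nostdlib stub executable cannot link against - hence the opt-out below.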
+CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) +UNPROFILE_OBJS := stub.o stub_exe.o KCOV_INSTRUMENT := n include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c deleted file mode 100644 index 62435187dda4..000000000000 --- a/arch/um/kernel/skas/clone.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <signal.h> -#include <sched.h> -#include <asm/unistd.h> -#include <sys/time.h> -#include <as-layout.h> -#include <ptrace_user.h> -#include <stub-data.h> -#include <sysdep/stub.h> - -/* - * This is in a separate file because it needs to be compiled with any - * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled - * - * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize - * on some systems. - */ - -void __attribute__ ((__section__ (".__syscall_stub"))) -stub_clone_handler(void) -{ - struct stub_data *data = get_stub_data(); - long err; - - err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - (unsigned long)data + - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE / 2); - if (err) { - data->parent_err = err; - goto done; - } - - err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - if (err) { - data->child_err = err; - goto done; - } - - remap_stack_and_trap(); - - done: - trap_myself(); -} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 656fe16c9b63..0eb5a1d3ba70 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -10,14 +10,18 @@ #include <asm/pgalloc.h> #include <asm/sections.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <os.h> #include <skas.h> +#include <stub-data.h> + +/* Ensure the stub_data struct covers the allocated area */ +static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); int init_new_context(struct task_struct *task, struct mm_struct *mm) { - struct mm_context *from_mm = NULL; - struct mm_context *to_mm = &mm->context; + struct mm_id *new_id = &mm->context.id; unsigned long stack = 0; int ret = -ENOMEM; @@ -25,34 +29,24 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) if (stack == 0) goto out; - to_mm->id.stack = stack; - if (current->mm != NULL && current->mm != &init_mm) - from_mm = ¤t->mm->context; + new_id->stack = stack; block_signals_trace(); - if (from_mm) - to_mm->id.u.pid = copy_context_skas0(stack, - from_mm->id.u.pid); - else to_mm->id.u.pid = start_userspace(stack); + new_id->pid = start_userspace(stack); unblock_signals_trace(); - if (to_mm->id.u.pid < 0) { - ret = to_mm->id.u.pid; + if (new_id->pid < 0) { + ret = new_id->pid; goto out_free; } - ret = init_new_ldt(to_mm, from_mm); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - init_ldt" - " failed, errno = %d\n", ret); - goto out_free; - } + /* Ensure the new MM is clean and nothing unwanted is mapped */ + unmap(new_id, 0, STUB_START); return 0; out_free: - if (to_mm->id.stack != 0) - free_pages(to_mm->id.stack, ilog2(STUB_DATA_PAGES)); + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } @@ -67,13 +61,12 @@ void destroy_context(struct mm_struct *mm) * whole UML suddenly dying. Also, cover negative and * 1 cases, since they shouldn't happen either. 
 */
-	if (mmu->id.u.pid < 2) {
+	if (mmu->id.pid < 2) {
 		printk(KERN_ERR "corrupt mm_context - pid = %d\n",
-		       mmu->id.u.pid);
+		       mmu->id.pid);
 		return;
 	}
 
-	os_kill_ptraced_process(mmu->id.u.pid, 1);
+	os_kill_ptraced_process(mmu->id.pid, 1);
 	free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
-	free_ldt(mmu);
 }
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index f2ac134c9752..05dcdc057af9 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -8,22 +8,19 @@
 #include <linux/sched/task_stack.h>
 #include <linux/sched/task.h>
 
+#include <asm/tlbflush.h>
+
 #include <as-layout.h>
 #include <kern.h>
 #include <os.h>
 #include <skas.h>
+#include <kern_util.h>
 
 extern void start_kernel(void);
 
 static int __init start_kernel_proc(void *unused)
 {
-	int pid;
-
 	block_signals_trace();
-	pid = os_getpid();
-
-	cpu_tasks[0].pid = pid;
-	cpu_tasks[0].task = current;
 
 	start_kernel();
 
 	return 0;
@@ -31,7 +28,7 @@ static int __init start_kernel_proc(void *unused)
 
 extern int userspace_pid[];
 
-extern char cpu0_irqstack[];
+static char cpu0_irqstack[THREAD_SIZE] __aligned(THREAD_SIZE);
 
 int __init start_uml(void)
 {
@@ -40,8 +37,8 @@ int __init start_uml(void)
 
 	init_new_thread_signals();
 
-	init_task.thread.request.u.thread.proc = start_kernel_proc;
-	init_task.thread.request.u.thread.arg = NULL;
+	init_task.thread.request.thread.proc = start_kernel_proc;
+	init_task.thread.request.thread.arg = NULL;
 	return start_idle_thread(task_stack_page(&init_task),
 				 &init_task.thread.switch_buf);
 }
@@ -53,3 +50,19 @@ unsigned long current_stub_stack(void)
 
 	return current->mm->context.id.stack;
 }
+
+struct mm_id *current_mm_id(void)
+{
+	if (current->mm == NULL)
+		return NULL;
+
+	return &current->mm->context.id;
+}
+
+void current_mm_sync(void)
+{
+	if (current->mm == NULL)
+		return;
+
+	um_tlb_sync(current->mm);
+}
diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c
new file mode 100644
index 000000000000..796fc266d3bb
--- /dev/null
+++ b/arch/um/kernel/skas/stub.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
+ */
+
+#include <sysdep/stub.h>
+
+static __always_inline int syscall_handler(struct stub_data *d)
+{
+	int i;
+	unsigned long res;
+
+	for (i = 0; i < d->syscall_data_len; i++) {
+		struct stub_syscall *sc = &d->syscall_data[i];
+
+		switch (sc->syscall) {
+		case STUB_SYSCALL_MMAP:
+			res = stub_syscall6(STUB_MMAP_NR,
+					    sc->mem.addr, sc->mem.length,
+					    sc->mem.prot,
+					    MAP_SHARED | MAP_FIXED,
+					    sc->mem.fd, sc->mem.offset);
+			if (res != sc->mem.addr) {
+				d->err = res;
+				d->syscall_data_len = i;
+				return -1;
+			}
+			break;
+		case STUB_SYSCALL_MUNMAP:
+			res = stub_syscall2(__NR_munmap,
+					    sc->mem.addr, sc->mem.length);
+			if (res) {
+				d->err = res;
+				d->syscall_data_len = i;
+				return -1;
+			}
+			break;
+		default:
+			d->err = -95; /* EOPNOTSUPP */
+			d->syscall_data_len = i;
+			return -1;
+		}
+	}
+
+	d->err = 0;
+	d->syscall_data_len = 0;
+
+	return 0;
+}
+
+void __section(".__syscall_stub")
+stub_syscall_handler(void)
+{
+	struct stub_data *d = get_stub_data();
+
+	syscall_handler(d);
+
+	trap_myself();
+}
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
new file mode 100644
index 000000000000..23c99b285e82
--- /dev/null
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -0,0 +1,95 @@
+#include <sys/ptrace.h>
+#include <sys/prctl.h>
+#include <asm/unistd.h>
+#include <sysdep/stub.h>
+#include <stub-data.h>
+
+void _start(void);
+
+noinline static void real_init(void)
+{
+	struct 
stub_init_data init_data; + unsigned long res; + struct { + void *ss_sp; + int ss_flags; + size_t ss_size; + } stack = { + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + }; + struct { + void *sa_handler_; + unsigned long sa_flags; + void *sa_restorer; + unsigned long long sa_mask; + } sa = { + /* Need to set SA_RESTORER (but the handler never returns) */ + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, + /* no need to mask any signals */ + .sa_mask = 0, + }; + + /* set a nice name */ + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + + /* Make sure this process dies if the kernel dies */ + stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + + /* read information from STDIN and close it */ + res = stub_syscall3(__NR_read, 0, + (unsigned long)&init_data, sizeof(init_data)); + if (res != sizeof(init_data)) + stub_syscall1(__NR_exit, 10); + + stub_syscall1(__NR_close, 0); + + /* map stub code + data */ + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, + init_data.stub_code_fd, init_data.stub_code_offset); + if (res != init_data.stub_start) + stub_syscall1(__NR_exit, 11); + + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start + UM_KERN_PAGE_SIZE, + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, + init_data.stub_data_fd, init_data.stub_data_offset); + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) + stub_syscall1(__NR_exit, 12); + + /* setup signal stack inside stub data */ + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); + + /* register SIGSEGV handler */ + sa.sa_handler_ = (void *) init_data.segv_handler; + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, + sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 13); + + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + + stub_syscall1(__NR_exit, 14); + + __builtin_unreachable(); +} + +__attribute__((naked)) void _start(void) +{ + /* + * Since the stack after exec() starts at the top-most address, + * but that's exactly where we also want to map the stub data + * and code, this must: + * - push the stack by 1 code and STUB_DATA_PAGES data pages + * - call real_init() + * This way, real_init() can use the stack normally, while the + * original stack further down (higher address) will become + * inaccessible after the mmap() calls above. 
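+	 *
+	 * Illustration only: stub_start() is provided by the per-arch
+	 * sysdep/stub.h, not by this file. On x86-64 one plausible
+	 * implementation sketch is:
+	 *
+	 *   asm volatile("subq %0,%%rsp ; call *%1" : :
+	 *                "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE),
+	 *                "r" (real_init));
+	 *
+	 * i.e. move the stack pointer down past the pages that the mmap()
+	 * calls above will later cover, then enter the C function.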
+ */ + stub_start(real_init); +} diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S new file mode 100644 index 000000000000..6d8914fbe8f1 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe_embed.S @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> + +__INITDATA + +SYM_DATA_START(stub_exe_start) + .incbin "arch/um/kernel/skas/stub_exe" +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) + +__FINIT diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 9ee19e566da3..a5beaea2967e 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -12,23 +12,13 @@ #include <sysdep/syscalls.h> #include <linux/time-internal.h> #include <asm/unistd.h> +#include <asm/delay.h> void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); int syscall; - /* - * If we have infinite CPU resources, then make every syscall also a - * preemption point, since we don't have any other preemption in this - * case, and kernel threads would basically never run until userspace - * went to sleep, even if said userspace interacts with the kernel in - * various ways. - */ - if (time_travel_mode == TT_MODE_INFCPU || - time_travel_mode == TT_MODE_EXTERNAL) - schedule(); - /* Initialize the syscall number and default return value. */ UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); @@ -41,9 +31,36 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall < __NR_syscalls) - PT_REGS_SET_SYSCALL_RETURN(regs, - EXECUTE_SYSCALL(syscall, regs)); + + /* + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. + */ + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + + if (syscall >= 0 && syscall < __NR_syscalls) { + unsigned long ret = EXECUTE_SYSCALL(syscall, regs); + + PT_REGS_SET_SYSCALL_RETURN(regs, ret); + + /* + * An error value here can be some form of -ERESTARTSYS + * and then we'd just loop. Make any error syscalls take + * some time, so that it won't just loop if something is + * not ready, and hopefully other things will make some + * progress. 
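+		 *
+		 * Illustrative example (assumed, not from this patch): a
+		 * userspace retry loop such as
+		 *
+		 *   while (read(fd, buf, len) < 0 && errno == EAGAIN)
+		 *       ;
+		 *
+		 * would otherwise spin with no virtual time passing in
+		 * inf-cpu/ext mode; the um_udelay(1) below makes every
+		 * failing attempt consume simulated time.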
+ */ + if (IS_ERR_VALUE(ret) && + (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL)) { + um_udelay(1); + schedule(); + } + } out: syscall_trace_leave(regs); diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 746715379f12..13ee5666668d 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -11,7 +11,6 @@ #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> -#include <asm/sysrq.h> #include <asm/stacktrace.h> #include <os.h> @@ -33,12 +32,6 @@ void show_stack(struct task_struct *task, unsigned long *stack, struct pt_regs *segv_regs = current->thread.segv_regs; int i; - if (!segv_regs && os_is_signal_stack()) { - pr_err("Received SIGSEGV in SIGSEGV handler," - " aborting stack trace!\n"); - return; - } - if (!stack) stack = get_stack_pointer(task, segv_regs); @@ -53,5 +46,5 @@ void show_stack(struct task_struct *task, unsigned long *stack, } printk("%sCall Trace:\n", loglvl); - dump_trace(current, &stackops, (void *)loglvl); + dump_trace(task ?: current, &stackops, (void *)loglvl); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 3e270da6b6f6..1394568c0210 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -19,17 +19,21 @@ #include <asm/param.h> #include <kern_util.h> #include <os.h> +#include <linux/delay.h> #include <linux/time-internal.h> #include <linux/um_timetravel.h> #include <shared/init.h> #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#include <linux/sched/clock.h> + enum time_travel_mode time_travel_mode; EXPORT_SYMBOL_GPL(time_travel_mode); static bool time_travel_start_set; static unsigned long long time_travel_start; static unsigned long long time_travel_time; +static unsigned long long time_travel_shm_offset; static LIST_HEAD(time_travel_events); static LIST_HEAD(time_travel_irqs); static unsigned long long time_travel_timer_interval; @@ -39,8 +43,20 @@ static int time_travel_ext_fd = -1; static unsigned int time_travel_ext_waiting; static bool time_travel_ext_prev_request_valid; static unsigned long long time_travel_ext_prev_request; -static bool time_travel_ext_free_until_valid; -static unsigned long long time_travel_ext_free_until; +static unsigned long long *time_travel_ext_free_until; +static unsigned long long _time_travel_ext_free_until; +static u16 time_travel_shm_id; +static struct um_timetravel_schedshm *time_travel_shm; +static union um_timetravel_schedshm_client *time_travel_shm_client; + +unsigned long tt_extra_sched_jiffies; + +notrace unsigned long long sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES + + tt_extra_sched_jiffies) + * (NSEC_PER_SEC / HZ); +} static void time_travel_set_time(unsigned long long ns) { @@ -57,8 +73,52 @@ enum time_travel_message_handling { TTMH_IDLE, TTMH_POLL, TTMH_READ, + TTMH_READ_START_ACK, }; +static u64 bc_message; +int time_travel_should_print_bc_msg; + +void _time_travel_print_bc_msg(void) +{ + time_travel_should_print_bc_msg = 0; + printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message); +} + +static void time_travel_setup_shm(int fd, u16 id) +{ + u32 len; + + time_travel_shm = os_mmap_rw_shared(fd, sizeof(*time_travel_shm)); + + if (!time_travel_shm) + goto out; + + len = time_travel_shm->len; + + if (time_travel_shm->version != UM_TIMETRAVEL_SCHEDSHM_VERSION || + len < struct_size(time_travel_shm, clients, id + 1)) { + os_unmap_memory(time_travel_shm, sizeof(*time_travel_shm)); + time_travel_shm = NULL; + goto out; + } + + time_travel_shm = os_mremap_rw_shared(time_travel_shm, + 
sizeof(*time_travel_shm), + len); + if (!time_travel_shm) + goto out; + + time_travel_shm_offset = time_travel_shm->current_time; + time_travel_shm_client = &time_travel_shm->clients[id]; + time_travel_shm_client->capa |= UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE; + time_travel_shm_id = id; + /* always look at that free_until from now on */ + time_travel_ext_free_until = &time_travel_shm->free_until; +out: + os_close_file(fd); +} + static void time_travel_handle_message(struct um_timetravel_msg *msg, enum time_travel_message_handling mode) { @@ -79,7 +139,20 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, } } - ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + if (unlikely(mode == TTMH_READ_START_ACK)) { + int fd[UM_TIMETRAVEL_SHARED_MAX_FDS]; + + ret = os_rcv_fd_msg(time_travel_ext_fd, fd, + ARRAY_SIZE(fd), msg, sizeof(*msg)); + if (ret == sizeof(*msg)) { + time_travel_setup_shm(fd[UM_TIMETRAVEL_SHARED_MEMFD], + msg->time & UM_TIMETRAVEL_START_ACK_ID); + /* we don't use the logging for now */ + os_close_file(fd[UM_TIMETRAVEL_SHARED_LOGFD]); + } + } else { + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + } if (ret == 0) panic("time-travel external link is broken\n"); @@ -95,10 +168,24 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, return; case UM_TIMETRAVEL_RUN: time_travel_set_time(msg->time); + if (time_travel_shm) { + /* no request right now since we're running */ + time_travel_shm_client->flags &= + ~UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + /* no ack for shared memory RUN */ + return; + } break; case UM_TIMETRAVEL_FREE_UNTIL: - time_travel_ext_free_until_valid = true; - time_travel_ext_free_until = msg->time; + /* not supposed to get this with shm, but ignore it */ + if (time_travel_shm) + break; + time_travel_ext_free_until = &_time_travel_ext_free_until; + _time_travel_ext_free_until = msg->time; + break; + case UM_TIMETRAVEL_BROADCAST: + bc_message = msg->time; + time_travel_should_print_bc_msg = 1; break; } @@ -135,8 +222,15 @@ static u64 time_travel_ext_req(u32 op, u64 time) block_signals_hard(); os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + /* no ACK expected for WAIT in shared memory mode */ + if (msg.op == UM_TIMETRAVEL_WAIT && time_travel_shm) + goto done; + while (msg.op != UM_TIMETRAVEL_ACK) - time_travel_handle_message(&msg, TTMH_READ); + time_travel_handle_message(&msg, + op == UM_TIMETRAVEL_START ? + TTMH_READ_START_ACK : + TTMH_READ); if (msg.seq != mseq) panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n", @@ -144,6 +238,7 @@ static u64 time_travel_ext_req(u32 op, u64 time) if (op == UM_TIMETRAVEL_GET) time_travel_set_time(msg.time); +done: unblock_signals_hard(); return msg.time; @@ -179,13 +274,33 @@ static void time_travel_ext_update_request(unsigned long long time) /* * if we're running and are allowed to run past the request * then we don't need to update it either + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. 
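+	 *
+	 * Worked example with made-up numbers: if the shared free_until
+	 * is 5000 and this instance's offset is 1000, any request for a
+	 * local time below 4000 is already covered and no REQUEST
+	 * message needs to be sent.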
*/ - if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && - time < time_travel_ext_free_until) + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) return; time_travel_ext_prev_request = time; time_travel_ext_prev_request_valid = true; + + if (time_travel_shm) { + union um_timetravel_schedshm_client *running; + + running = &time_travel_shm->clients[time_travel_shm->running_id]; + + if (running->capa & UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE) { + time_travel_shm_client->flags |= + UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + time += time_travel_shm_offset; + time_travel_shm_client->req_time = time; + if (time < time_travel_shm->free_until) + time_travel_shm->free_until = time; + return; + } + } + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); } @@ -193,6 +308,14 @@ void __time_travel_propagate_time(void) { static unsigned long long last_propagated; + if (time_travel_shm) { + if (time_travel_shm->running_id != time_travel_shm_id) + panic("time-travel: setting time while not running\n"); + time_travel_shm->current_time = time_travel_time + + time_travel_shm_offset; + return; + } + if (last_propagated == time_travel_time) return; @@ -208,9 +331,12 @@ static bool time_travel_ext_request(unsigned long long time) * If we received an external sync point ("free until") then we * don't have to request/wait for anything until then, unless * we're already waiting. + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. */ - if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && - time < time_travel_ext_free_until) + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) return false; time_travel_ext_update_request(time); @@ -224,7 +350,8 @@ static void time_travel_ext_wait(bool idle) }; time_travel_ext_prev_request_valid = false; - time_travel_ext_free_until_valid = false; + if (!time_travel_shm) + time_travel_ext_free_until = NULL; time_travel_ext_waiting++; time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); @@ -247,7 +374,11 @@ static void time_travel_ext_wait(bool idle) static void time_travel_ext_get_time(void) { - time_travel_ext_req(UM_TIMETRAVEL_GET, -1); + if (time_travel_shm) + time_travel_set_time(time_travel_shm->current_time - + time_travel_shm_offset); + else + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); } static void __time_travel_update_time(unsigned long long ns, bool idle) @@ -319,10 +450,15 @@ void time_travel_add_event_rel(struct time_travel_event *e, time_travel_add_event(e, time_travel_time + delay_ns); } -void time_travel_periodic_timer(struct time_travel_event *e) +static void time_travel_periodic_timer(struct time_travel_event *e) { time_travel_add_event(&time_travel_timer_event, time_travel_time + time_travel_timer_interval); + + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -474,6 +610,10 @@ EXPORT_SYMBOL_GPL(time_travel_add_irq_event); static void time_travel_oneshot_timer(struct time_travel_event *e) { + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -719,7 +859,7 @@ static irqreturn_t um_timer(int irq, void *dev) if (get_current()->mm != NULL) { /* userspace - relay signal, results in correct userspace timers */ 
- os_alarm_process(get_current()->mm->context.id.u.pid); + os_alarm_process(get_current()->mm->context.id.pid); } (*timer_clockevent.event_handler)(&timer_clockevent); @@ -812,7 +952,7 @@ unsigned long calibrate_delay_is_known(void) return 0; } -int setup_time_travel(char *str) +static int setup_time_travel(char *str) { if (strcmp(str, "=inf-cpu") == 0) { time_travel_mode = TT_MODE_INFCPU; @@ -862,7 +1002,7 @@ __uml_help(setup_time_travel, "devices using it, assuming the device has the right capabilities.\n" "The optional ID is a 64-bit integer that's sent to the central scheduler.\n"); -int setup_time_travel_start(char *str) +static int setup_time_travel_start(char *str) { int err; @@ -874,9 +1014,49 @@ int setup_time_travel_start(char *str) return 1; } -__setup("time-travel-start", setup_time_travel_start); +__setup("time-travel-start=", setup_time_travel_start); __uml_help(setup_time_travel_start, -"time-travel-start=<seconds>\n" +"time-travel-start=<nanoseconds>\n" "Configure the UML instance's wall clock to start at this value rather than\n" "the host's wall clock at the time of UML boot.\n"); +static struct kobject *bc_time_kobject; + +static ssize_t bc_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%llx", bc_message); +} + +static ssize_t bc_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret; + u64 user_bc_message; + + ret = kstrtou64(buf, 0, &user_bc_message); + if (ret) + return ret; + + bc_message = user_bc_message; + + time_travel_ext_req(UM_TIMETRAVEL_BROADCAST, bc_message); + pr_info("um: time: sent broadcast message: 0x%llx\n", bc_message); + return count; +} + +static struct kobj_attribute bc_attribute = __ATTR(bc-message, 0660, bc_show, bc_store); + +static int __init um_bc_start(void) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return 0; + + bc_time_kobject = kobject_create_and_add("um-ext-time", kernel_kobj); + if (!bc_time_kobject) + return 0; + + if (sysfs_create_file(bc_time_kobject, &bc_attribute.attr)) + pr_debug("failed to create the bc file in /sys/kernel/um_time"); + + return 0; +} +late_initcall(um_bc_start); #endif diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 7d050ab0f78a..cf7e0d4407f2 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -8,241 +8,82 @@ #include <linux/sched/signal.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <mem_user.h> #include <os.h> #include <skas.h> #include <kern_util.h> -struct host_vm_change { - struct host_vm_op { - enum { NONE, MMAP, MUNMAP, MPROTECT } type; - union { - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - int fd; - __u64 offset; - } mmap; - struct { - unsigned long addr; - unsigned long len; - } munmap; - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - } mprotect; - } u; - } ops[1]; - int userspace; - int index; - struct mm_struct *mm; - void *data; - int force; -}; - -#define INIT_HVC(mm, force, userspace) \ - ((struct host_vm_change) \ - { .ops = { { .type = NONE } }, \ - .mm = mm, \ - .data = NULL, \ - .userspace = userspace, \ - .index = 0, \ - .force = force }) - -static void report_enomem(void) -{ - printk(KERN_ERR "UML ran out of memory on the host side! 
" - "This can happen due to a memory limitation or " - "vm.max_map_count has been reached.\n"); -} - -static int do_ops(struct host_vm_change *hvc, int end, - int finished) -{ - struct host_vm_op *op; - int i, ret = 0; +struct vm_ops { + struct mm_id *mm_idp; - for (i = 0; i < end && !ret; i++) { - op = &hvc->ops[i]; - switch (op->type) { - case MMAP: - if (hvc->userspace) - ret = map(&hvc->mm->context.id, op->u.mmap.addr, - op->u.mmap.len, op->u.mmap.prot, - op->u.mmap.fd, - op->u.mmap.offset, finished, - &hvc->data); - else - map_memory(op->u.mmap.addr, op->u.mmap.offset, - op->u.mmap.len, 1, 1, 1); - break; - case MUNMAP: - if (hvc->userspace) - ret = unmap(&hvc->mm->context.id, - op->u.munmap.addr, - op->u.munmap.len, finished, - &hvc->data); - else - ret = os_unmap_memory( - (void *) op->u.munmap.addr, - op->u.munmap.len); - - break; - case MPROTECT: - if (hvc->userspace) - ret = protect(&hvc->mm->context.id, - op->u.mprotect.addr, - op->u.mprotect.len, - op->u.mprotect.prot, - finished, &hvc->data); - else - ret = os_protect_memory( - (void *) op->u.mprotect.addr, - op->u.mprotect.len, - 1, 1, 1); - break; - default: - printk(KERN_ERR "Unknown op type %d in do_ops\n", - op->type); - BUG(); - break; - } - } - - if (ret == -ENOMEM) - report_enomem(); - - return ret; -} + int (*mmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset); + int (*unmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len); +}; -static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +static int kern_map(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - __u64 offset; - struct host_vm_op *last; - int fd = -1, ret = 0; - - if (hvc->userspace) - fd = phys_mapping(phys, &offset); - else - offset = phys; - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MMAP) && - (last->u.mmap.addr + last->u.mmap.len == virt) && - (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) && - (last->u.mmap.offset + last->u.mmap.len == offset)) { - last->u.mmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MMAP, - .u = { .mmap = { .addr = virt, - .len = len, - .prot = prot, - .fd = fd, - .offset = offset } - } }); - return ret; + /* TODO: Why is executable needed to be always set in the kernel? 
 */
+	return os_map_memory((void *)virt, phys_fd, offset, len,
+			     prot & UM_PROT_READ, prot & UM_PROT_WRITE,
+			     1);
 }
 
-static int add_munmap(unsigned long addr, unsigned long len,
-		      struct host_vm_change *hvc)
+static int kern_unmap(struct mm_id *mm_idp,
+		      unsigned long virt, unsigned long len)
 {
-	struct host_vm_op *last;
-	int ret = 0;
-
-	if (hvc->index != 0) {
-		last = &hvc->ops[hvc->index - 1];
-		if ((last->type == MUNMAP) &&
-		    (last->u.munmap.addr + last->u.mmap.len == addr)) {
-			last->u.munmap.len += len;
-			return 0;
-		}
-	}
-
-	if (hvc->index == ARRAY_SIZE(hvc->ops)) {
-		ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
-		hvc->index = 0;
-	}
-
-	hvc->ops[hvc->index++] = ((struct host_vm_op)
-				  { .type = MUNMAP,
-				    .u = { .munmap = { .addr = addr,
-						       .len = len } } });
-	return ret;
+	return os_unmap_memory((void *)virt, len);
 }
 
-static int add_mprotect(unsigned long addr, unsigned long len,
-			unsigned int prot, struct host_vm_change *hvc)
+void report_enomem(void)
 {
-	struct host_vm_op *last;
-	int ret = 0;
-
-	if (hvc->index != 0) {
-		last = &hvc->ops[hvc->index - 1];
-		if ((last->type == MPROTECT) &&
-		    (last->u.mprotect.addr + last->u.mprotect.len == addr) &&
-		    (last->u.mprotect.prot == prot)) {
-			last->u.mprotect.len += len;
-			return 0;
-		}
-	}
-
-	if (hvc->index == ARRAY_SIZE(hvc->ops)) {
-		ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
-		hvc->index = 0;
-	}
-
-	hvc->ops[hvc->index++] = ((struct host_vm_op)
-				  { .type = MPROTECT,
-				    .u = { .mprotect = { .addr = addr,
-							 .len = len,
-							 .prot = prot } } });
-	return ret;
+	printk(KERN_ERR "UML ran out of memory on the host side! "
+	       "This can happen due to a memory limitation or because "
+	       "vm.max_map_count has been reached.\n");
 }
 
-#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1))
-
 static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
 				   unsigned long end,
-				   struct host_vm_change *hvc)
+				   struct vm_ops *ops)
 {
 	pte_t *pte;
-	int r, w, x, prot, ret = 0;
+	int ret = 0;
 
 	pte = pte_offset_kernel(pmd, addr);
 	do {
-		r = pte_read(*pte);
-		w = pte_write(*pte);
-		x = pte_exec(*pte);
-		if (!pte_young(*pte)) {
-			r = 0;
-			w = 0;
-		} else if (!pte_dirty(*pte))
-			w = 0;
+		if (!pte_needsync(*pte))
+			continue;
+
+		if (pte_present(*pte)) {
+			__u64 offset;
+			unsigned long phys = pte_val(*pte) & PAGE_MASK;
+			int fd = phys_mapping(phys, &offset);
+			int r, w, x, prot;
+
+			r = pte_read(*pte);
+			w = pte_write(*pte);
+			x = pte_exec(*pte);
+			if (!pte_young(*pte)) {
+				r = 0;
+				w = 0;
+			} else if (!pte_dirty(*pte))
+				w = 0;
+
+			prot = (r ? UM_PROT_READ : 0) |
+			       (w ? UM_PROT_WRITE : 0) |
+			       (x ? UM_PROT_EXEC : 0);
+
+			ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE,
+					prot, fd, offset);
+		} else
+			ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE);
 
-		prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
-			(x ? 
UM_PROT_EXEC : 0)); - if (hvc->force || pte_newpage(*pte)) { - if (pte_present(*pte)) { - if (pte_newpage(*pte)) - ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, prot, hvc); - } else - ret = add_munmap(addr, PAGE_SIZE, hvc); - } else if (pte_newprot(*pte)) - ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -250,7 +91,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, static inline int update_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pmd_t *pmd; unsigned long next; @@ -260,19 +101,20 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (hvc->force || pmd_newpage(*pmd)) { - ret = add_munmap(addr, next - addr, hvc); + if (pmd_needsync(*pmd)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pmd_mkuptodate(*pmd); } } - else ret = update_pte_range(pmd, addr, next, hvc); + else ret = update_pte_range(pmd, addr, next, ops); } while (pmd++, addr = next, ((addr < end) && !ret)); return ret; } static inline int update_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pud_t *pud; unsigned long next; @@ -282,19 +124,20 @@ static inline int update_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (hvc->force || pud_newpage(*pud)) { - ret = add_munmap(addr, next - addr, hvc); + if (pud_needsync(*pud)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pud_mkuptodate(*pud); } } - else ret = update_pmd_range(pud, addr, next, hvc); + else ret = update_pmd_range(pud, addr, next, ops); } while (pud++, addr = next, ((addr < end) && !ret)); return ret; } static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { p4d_t *p4d; unsigned long next; @@ -304,227 +147,57 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) { - if (hvc->force || p4d_newpage(*p4d)) { - ret = add_munmap(addr, next - addr, hvc); + if (p4d_needsync(*p4d)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); p4d_mkuptodate(*p4d); } } else - ret = update_pud_range(p4d, addr, next, hvc); + ret = update_pud_range(p4d, addr, next, ops); } while (p4d++, addr = next, ((addr < end) && !ret)); return ret; } -static void fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) +int um_tlb_sync(struct mm_struct *mm) { pgd_t *pgd; - struct host_vm_change hvc; - unsigned long addr = start_addr, next; - int ret = 0, userspace = 1; + struct vm_ops ops; + unsigned long addr = mm->context.sync_tlb_range_from, next; + int ret = 0; + + if (mm->context.sync_tlb_range_to == 0) + return 0; + + ops.mm_idp = &mm->context.id; + if (mm == &init_mm) { + ops.mmap = kern_map; + ops.unmap = kern_unmap; + } else { + ops.mmap = map; + ops.unmap = unmap; + } - hvc = INIT_HVC(mm, force, userspace); pgd = pgd_offset(mm, addr); do { - next = pgd_addr_end(addr, end_addr); + next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); if (!pgd_present(*pgd)) { - if (force || pgd_newpage(*pgd)) { - ret = add_munmap(addr, next - addr, &hvc); + if (pgd_needsync(*pgd)) { + ret = ops.unmap(ops.mm_idp, addr, + next - addr); pgd_mkuptodate(*pgd); } } else - 
ret = update_p4d_range(pgd, addr, next, &hvc);
-	} while (pgd++, addr = next, ((addr < end_addr) && !ret));
-
-	if (!ret)
-		ret = do_ops(&hvc, hvc.index, 1);
+			ret = update_p4d_range(pgd, addr, next, &ops);
+	} while (pgd++, addr = next,
+		 ((addr < mm->context.sync_tlb_range_to) && !ret));
 
-	/* This is not an else because ret is modified above */
-	if (ret) {
-		struct mm_id *mm_idp = &current->mm->context.id;
-
-		printk(KERN_ERR "fix_range_common: failed, killing current "
-		       "process: %d\n", task_tgid_vnr(current));
-		mm_idp->kill = 1;
-	}
-}
-
-static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
-{
-	struct mm_struct *mm;
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr, last;
-	int updated = 0, err = 0, force = 0, userspace = 0;
-	struct host_vm_change hvc;
-
-	mm = &init_mm;
-	hvc = INIT_HVC(mm, force, userspace);
-	for (addr = start; addr < end;) {
-		pgd = pgd_offset(mm, addr);
-		if (!pgd_present(*pgd)) {
-			last = ADD_ROUND(addr, PGDIR_SIZE);
-			if (last > end)
-				last = end;
-			if (pgd_newpage(*pgd)) {
-				updated = 1;
-				err = add_munmap(addr, last - addr, &hvc);
-				if (err < 0)
-					panic("munmap failed, errno = %d\n",
-					      -err);
-			}
-			addr = last;
-			continue;
-		}
-
-		p4d = p4d_offset(pgd, addr);
-		if (!p4d_present(*p4d)) {
-			last = ADD_ROUND(addr, P4D_SIZE);
-			if (last > end)
-				last = end;
-			if (p4d_newpage(*p4d)) {
-				updated = 1;
-				err = add_munmap(addr, last - addr, &hvc);
-				if (err < 0)
-					panic("munmap failed, errno = %d\n",
-					      -err);
-			}
-			addr = last;
-			continue;
-		}
-
-		pud = pud_offset(p4d, addr);
-		if (!pud_present(*pud)) {
-			last = ADD_ROUND(addr, PUD_SIZE);
-			if (last > end)
-				last = end;
-			if (pud_newpage(*pud)) {
-				updated = 1;
-				err = add_munmap(addr, last - addr, &hvc);
-				if (err < 0)
-					panic("munmap failed, errno = %d\n",
-					      -err);
-			}
-			addr = last;
-			continue;
-		}
-
-		pmd = pmd_offset(pud, addr);
-		if (!pmd_present(*pmd)) {
-			last = ADD_ROUND(addr, PMD_SIZE);
-			if (last > end)
-				last = end;
-			if (pmd_newpage(*pmd)) {
-				updated = 1;
-				err = add_munmap(addr, last - addr, &hvc);
-				if (err < 0)
-					panic("munmap failed, errno = %d\n",
-					      -err);
-			}
-			addr = last;
-			continue;
-		}
-
-		pte = pte_offset_kernel(pmd, addr);
-		if (!pte_present(*pte) || pte_newpage(*pte)) {
-			updated = 1;
-			err = add_munmap(addr, PAGE_SIZE, &hvc);
-			if (err < 0)
-				panic("munmap failed, errno = %d\n",
-				      -err);
-			if (pte_present(*pte))
-				err = add_mmap(addr, pte_val(*pte) & PAGE_MASK,
-					       PAGE_SIZE, 0, &hvc);
-		}
-		else if (pte_newprot(*pte)) {
-			updated = 1;
-			err = add_mprotect(addr, PAGE_SIZE, 0, &hvc);
-		}
-		addr += PAGE_SIZE;
-	}
-	if (!err)
-		err = do_ops(&hvc, hvc.index, 1);
-
-	if (err < 0)
-		panic("flush_tlb_kernel failed, errno = %d\n", err);
-	return updated;
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	struct mm_struct *mm = vma->vm_mm;
-	void *flush = NULL;
-	int r, w, x, prot, err = 0;
-	struct mm_id *mm_id;
-
-	address &= PAGE_MASK;
-
-	pgd = pgd_offset(mm, address);
-	if (!pgd_present(*pgd))
-		goto kill;
-
-	p4d = p4d_offset(pgd, address);
-	if (!p4d_present(*p4d))
-		goto kill;
-
-	pud = pud_offset(p4d, address);
-	if (!pud_present(*pud))
-		goto kill;
-
-	pmd = pmd_offset(pud, address);
-	if (!pmd_present(*pmd))
-		goto kill;
-
-	pte = pte_offset_kernel(pmd, address);
-
-	r = pte_read(*pte);
-	w = pte_write(*pte);
-	x = pte_exec(*pte);
-	if (!pte_young(*pte)) {
-		r = 0;
-		w = 0;
-	} else if (!pte_dirty(*pte)) {
-		w = 0;
-	}
-
-	
mm_id = &mm->context.id; - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - unsigned long long offset; - int fd; - - fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); - err = map(mm_id, address, PAGE_SIZE, prot, fd, offset, - 1, &flush); - } - else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush); - } - else if (pte_newprot(*pte)) - err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush); - - if (err) { - if (err == -ENOMEM) - report_enomem(); - - goto kill; - } - - *pte = pte_mkuptodate(*pte); + if (ret == -ENOMEM) + report_enomem(); - return; + mm->context.sync_tlb_range_from = 0; + mm->context.sync_tlb_range_to = 0; -kill: - printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); - force_sig(SIGKILL); + return ret; } void flush_tlb_all(void) @@ -539,66 +212,11 @@ void flush_tlb_all(void) flush_tlb_mm(current->mm); } -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_kernel_range_common(start, end); -} - -void flush_tlb_kernel_vm(void) -{ - flush_tlb_kernel_range_common(start_vm, end_vm); -} - -void __flush_tlb_one(unsigned long addr) -{ - flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); -} - -static void fix_range(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) -{ - /* - * Don't bother flushing if this address space is about to be - * destroyed. - */ - if (atomic_read(&mm->mm_users) == 0) - return; - - fix_range_common(mm, start_addr, end_addr, force); -} - -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_mm == NULL) - flush_tlb_kernel_range_common(start, end); - else fix_range(vma->vm_mm, start, end, 0); -} -EXPORT_SYMBOL(flush_tlb_range); - -void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - fix_range(mm, start, end, 0); -} - void flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end, 0); -} - -void force_flush_all(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end, 1); - mmap_read_unlock(mm); + um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end); } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 6d8ae86ae978..ce073150dc20 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -16,6 +16,7 @@ #include <kern_util.h> #include <os.h> #include <skas.h> +#include <arch.h> /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by @@ -113,7 +114,7 @@ good_area: #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif - flush_tlb_page(vma, address); + out: mmap_read_unlock(mm); out_nosemaphore: @@ -175,12 +176,14 @@ void fatal_sigsegv(void) * @sig: the signal number * @unused_si: the signal info struct; unused in this handler * @regs: the ptrace register information + * @mc: the mcontext of the signal * * The handler first extracts the faultinfo from the UML ptrace regs struct. * If the userfault did not happen in an UML userspace process, bad_segv is called. * Otherwise the signal did happen in a cloned userspace process, handle it. 
*/ -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -189,7 +192,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -199,9 +202,8 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { - jmp_buf *catcher; int si_code; int err; int is_write = FAULT_WRITE(fi); @@ -210,11 +212,33 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, if (!is_user && regs) current->thread.segv_regs = container_of(regs, struct pt_regs, regs); - if (!is_user && (address >= start_vm) && (address < end_vm)) { - flush_tlb_kernel_vm(); + if (!is_user && init_mm.context.sync_tlb_range_to) { + /* + * Kernel has pending updates from set_ptes that were not + * flushed yet. Syncing them should fix the pagefault (if not + * we'll get here again and panic). + */ + err = um_tlb_sync(&init_mm); + if (err == -ENOMEM) + report_enomem(); + if (err) + panic("Failed to sync kernel TLBs: %d", err); goto out; } else if (current->mm == NULL) { + if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } @@ -237,15 +261,8 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, address = 0; } - catcher = current->thread.fault_catcher; if (!err) goto out; - else if (catcher != NULL) { - current->thread.fault_addr = (void *) address; - UML_LONGJMP(catcher, 1); - } - else if (current->thread.fault_addr != NULL) - panic("fault_addr set but no fault catcher"); else if (!is_user && arch_fixup(ip, regs)) goto out; @@ -273,7 +290,8 @@ out: return 0; } -void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { int code, err; if (!UPT_IS_USER(regs)) { @@ -301,15 +319,8 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) } } -void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) -{ - if (current->thread.fault_catcher != NULL) - UML_LONGJMP(current->thread.fault_catcher, 1); - else - relay_signal(sig, si, regs); -} - -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 7a9820797eae..d4b3b6742ec8 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -12,6 +12,7 @@ #include <linux/panic_notifier.h> #include <linux/seq_file.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/utsname.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ 
-23,6 +24,7 @@ #include <asm/cpufeature.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/text-patching.h> #include <as-layout.h> #include <arch.h> #include <init.h> @@ -64,9 +66,6 @@ struct cpuinfo_um boot_cpu_data = { EXPORT_SYMBOL(boot_cpu_data); -union thread_union cpu0_irqstack - __section(".data..init_irqstack") = - { .thread_info = INIT_THREAD_INFO(init_task) }; /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; @@ -80,7 +79,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? "yes" : "no"); + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); seq_printf(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) @@ -125,15 +124,12 @@ unsigned long uml_reserved; /* Also modified in mem_init */ unsigned long start_vm; unsigned long end_vm; -/* Set in uml_ncpus_setup */ -int ncpus = 1; - /* Set in early boot */ static int have_root __initdata; static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 64 * 1024 * 1024; +unsigned long long physmem_size = 64 * 1024 * 1024; EXPORT_SYMBOL(physmem_size); static const char *usage_string = @@ -169,19 +165,6 @@ __uml_setup("root=", uml_root_setup, " root=/dev/ubd5\n\n" ); -static int __init no_skas_debug_setup(char *line, int *add) -{ - os_warn("'debug' is not necessary to gdb UML in skas mode - run\n"); - os_warn("'gdb linux'\n"); - - return 0; -} - -__uml_setup("debug", no_skas_debug_setup, -"debug\n" -" this flag is not needed to run gdb on UML in skas mode\n\n" -); - static int __init uml_console_setup(char *line, int *add) { have_console = 1; @@ -259,6 +242,8 @@ static struct notifier_block panic_exit_notifier = { void uml_finishsetup(void) { + cpu_tasks[0] = &init_task; + atomic_notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); @@ -280,7 +265,7 @@ EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) -static void parse_host_cpu_flags(char *line) +static void __init parse_host_cpu_flags(char *line) { int i; for (i = 0; i < 32*NCAPINTS; i++) { @@ -288,7 +273,8 @@ static void parse_host_cpu_flags(char *line) set_cpu_cap(&boot_cpu_data, i); } } -static void parse_cache_line(char *line) + +static void __init parse_cache_line(char *line) { long res; char *to_parse = strstr(line, ":"); @@ -304,7 +290,24 @@ static void parse_cache_line(char *line) } } -int __init linux_main(int argc, char **argv) +static unsigned long __init get_top_address(char **envp) +{ + unsigned long top_addr = (unsigned long) &top_addr; + int i; + + /* The earliest variable should be after the program name in ELF */ + for (i = 0; envp[i]; i++) { + if ((unsigned long) envp[i] > top_addr) + top_addr = (unsigned long) envp[i]; + } + + top_addr &= ~(UM_KERN_PAGE_SIZE - 1); + top_addr += UM_KERN_PAGE_SIZE; + + return top_addr; +} + +int __init linux_main(int argc, char **argv, char **envp) { unsigned long avail, diff; unsigned long virtmem_size, max_physmem; @@ -326,20 +329,23 @@ int __init linux_main(int argc, char **argv) if (have_console == 0) add_arg(DEFAULT_COMMAND_LINE_CONSOLE); - host_task_size = os_get_top_address(); - /* reserve a few pages for the stubs (taking care of data alignment) */ - /* align the data portion */ - 
BUILD_BUG_ON(!is_power_of_2(STUB_DATA_PAGES)); - stub_start = (host_task_size - 1) & ~(STUB_DATA_PAGES * PAGE_SIZE - 1); + host_task_size = get_top_address(envp); + /* reserve a few pages for the stubs */ + stub_start = host_task_size - STUB_DATA_PAGES * PAGE_SIZE; /* another page for the code portion */ stub_start -= PAGE_SIZE; host_task_size = stub_start; + /* Limit TASK_SIZE to what is addressable by the page table */ + task_size = host_task_size; + if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE) + task_size = PTRS_PER_PGD * PGDIR_SIZE; + /* * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps * out */ - task_size = host_task_size & PGDIR_MASK; + task_size = task_size & PGDIR_MASK; /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); @@ -368,23 +374,18 @@ int __init linux_main(int argc, char **argv) setup_machinename(init_utsname()->machine); - highmem = 0; + physmem_size = (physmem_size + PAGE_SIZE - 1) & PAGE_MASK; iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; - max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; - /* - * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary, - * so this makes sure that's true for highmem - */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1); - if (physmem_size + iomem_size > max_physmem) { - highmem = physmem_size + iomem_size - max_physmem; - physmem_size -= highmem; + max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; + if (physmem_size > max_physmem) { + physmem_size = max_physmem; + os_info("Physical memory size shrunk to %llu bytes\n", + physmem_size); } high_physmem = uml_physmem + physmem_size; end_iomem = high_physmem + iomem_size; - high_memory = (void *) end_iomem; start_vm = VMALLOC_START; @@ -400,6 +401,8 @@ int __init linux_main(int argc, char **argv) os_info("Kernel virtual memory size shrunk to %lu bytes\n", virtmem_size); + arch_task_struct_size = sizeof(struct task_struct) + host_fp_size; + os_flush_stdout(); return start_uml(); @@ -414,9 +417,8 @@ void __init setup_arch(char **cmdline_p) { u8 rng_seed[32]; - stack_protections((unsigned long) &init_thread_info); - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - mem_total_pages(physmem_size, iomem_size, highmem); + stack_protections((unsigned long) init_task.stack); + setup_physmem(uml_physmem, uml_reserved, physmem_size); uml_dtb_init(); read_initrd(); @@ -470,6 +472,11 @@ void *text_poke(void *addr, const void *opcode, size_t len) return memcpy(addr, opcode, len); } +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + void text_poke_sync(void) { } diff --git a/arch/um/kernel/um_arch.h b/arch/um/kernel/um_arch.h index 1e07fb7ee35e..46e731ab9dfc 100644 --- a/arch/um/kernel/um_arch.h +++ b/arch/um/kernel/um_arch.h @@ -11,4 +11,6 @@ extern void __init uml_dtb_init(void); static inline void uml_dtb_init(void) { } #endif +extern int __init read_initrd(void); + #endif diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 5c92d58a78e8..a409d4b66114 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -77,8 +77,6 @@ SECTIONS .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . 
= ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.gnu.linkonce.d*) CONSTRUCTORS diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 544e0b344c75..049dfa5bc9c6 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -12,6 +12,8 @@ obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \ CFLAGS_signal.o += -Wframe-larger-than=4096 +CFLAGS_main.o += -Wno-frame-larger-than + obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c index 3182e759d8de..5e5ee40680ce 100644 --- a/arch/um/os-Linux/drivers/ethertap_kern.c +++ b/arch/um/os-Linux/drivers/ethertap_kern.c @@ -63,7 +63,7 @@ const struct net_kern_info ethertap_kern_info = { .write = etap_write, }; -int ethertap_setup(char *str, char **mac_out, void *data) +static int ethertap_setup(char *str, char **mac_out, void *data) { struct ethertap_init *init = data; diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c index adcb6717be6f..ff022d9cf0dd 100644 --- a/arch/um/os-Linux/drivers/tuntap_kern.c +++ b/arch/um/os-Linux/drivers/tuntap_kern.c @@ -53,7 +53,7 @@ const struct net_kern_info tuntap_kern_info = { .write = tuntap_write, }; -int tuntap_setup(char *str, char **mac_out, void *data) +static int tuntap_setup(char *str, char **mac_out, void *data) { struct tuntap_init *init = data; diff --git a/arch/um/os-Linux/elf_aux.c b/arch/um/os-Linux/elf_aux.c index 344ac403fb5d..0a0f91cf4d6d 100644 --- a/arch/um/os-Linux/elf_aux.c +++ b/arch/um/os-Linux/elf_aux.c @@ -13,6 +13,7 @@ #include <init.h> #include <elf_user.h> #include <mem_user.h> +#include "internal.h" typedef Elf32_auxv_t elf_auxv_t; diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index fc4450db59bd..a0d01c68ce3e 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -17,6 +17,7 @@ #include <sys/stat.h> #include <sys/sysmacros.h> #include <sys/un.h> +#include <sys/mman.h> #include <sys/types.h> #include <sys/eventfd.h> #include <poll.h> @@ -240,15 +241,19 @@ out: return err; } -void os_close_file(int fd) +int os_dup_file(int fd) { - close(fd); + int new_fd = dup(fd); + + if (new_fd < 0) + return -errno; + + return new_fd; } -int os_fsync_file(int fd) + +void os_close_file(int fd) { - if (fsync(fd) < 0) - return -errno; - return 0; + close(fd); } int os_seek_file(int fd, unsigned long long offset) @@ -502,44 +507,51 @@ int os_shutdown_socket(int fd, int r, int w) return 0; } -int os_rcv_fd(int fd, int *helper_pid_out) +/** + * os_rcv_fd_msg - receive message with (optional) FDs + * @fd: the FD to receive from + * @fds: the array for FDs to write to + * @n_fds: number of FDs to receive (@fds array size) + * @data: the message buffer + * @data_len: the size of the message to receive + * + * Receive a message with FDs. 
+ * + * Returns: the size of the received message, or an error code + */ +ssize_t os_rcv_fd_msg(int fd, int *fds, unsigned int n_fds, + void *data, size_t data_len) { - int new, n; - char buf[CMSG_SPACE(sizeof(new))]; - struct msghdr msg; +#define MAX_RCV_FDS 2 + char buf[CMSG_SPACE(sizeof(*fds) * MAX_RCV_FDS)]; struct cmsghdr *cmsg; - struct iovec iov; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - iov = ((struct iovec) { .iov_base = helper_pid_out, - .iov_len = sizeof(*helper_pid_out) }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; + struct iovec iov = { + .iov_base = data, + .iov_len = data_len, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = buf, + .msg_controllen = CMSG_SPACE(sizeof(*fds) * n_fds), + }; + int n; + + if (n_fds > MAX_RCV_FDS) + return -EINVAL; n = recvmsg(fd, &msg, 0); if (n < 0) return -errno; - else if (n != iov.iov_len) - *helper_pid_out = -1; cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "rcv_fd didn't receive anything, " - "error = %d\n", errno); - return -1; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "rcv_fd didn't receive a descriptor\n"); - return -1; - } + if (!cmsg || + cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return n; - new = ((int *) CMSG_DATA(cmsg))[0]; - return new; + memcpy(fds, CMSG_DATA(cmsg), cmsg->cmsg_len); + return n; } int os_create_unix_socket(const char *file, int len, int close_on_exec) @@ -705,3 +717,25 @@ int os_poll(unsigned int n, const int *fds) return -EIO; } + +void *os_mmap_rw_shared(int fd, size_t size) +{ + void *res = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + + if (res == MAP_FAILED) + return NULL; + + return res; +} + +void *os_mremap_rw_shared(void *old_addr, size_t old_size, size_t new_size) +{ + void *res; + + res = mremap(old_addr, old_size, new_size, MREMAP_MAYMOVE, NULL); + + if (res == MAP_FAILED) + return NULL; + + return res; +} diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index 3cb8ac63be6e..89c2ad2a4e3a 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -8,6 +8,7 @@ #include <unistd.h> #include <errno.h> #include <sched.h> +#include <pthread.h> #include <linux/limits.h> #include <sys/socket.h> #include <sys/wait.h> @@ -121,6 +122,10 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long stack, sp; int pid, status, err; + /* To share memory space, use os_run_helper_thread() instead. */ + if (flags & CLONE_VM) + return -EINVAL; + stack = alloc_stack(0, __uml_cant_sleep()); if (stack == 0) return -ENOMEM; @@ -167,3 +172,65 @@ int helper_wait(int pid) } else return 0; } + +struct os_helper_thread { + pthread_t handle; +}; + +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg) +{ + struct os_helper_thread *td; + sigset_t sigset, oset; + int err, flags; + + flags = __uml_cant_sleep() ? 
UM_GFP_ATOMIC : UM_GFP_KERNEL; + td = uml_kmalloc(sizeof(*td), flags); + if (!td) + return -ENOMEM; + + sigfillset(&sigset); + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { + err = -errno; + kfree(td); + return err; + } + + err = pthread_create(&td->handle, NULL, routine, arg); + + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) + panic("Failed to restore the signal mask: %d", errno); + + if (err != 0) + kfree(td); + else + *td_out = td; + + return -err; +} + +void os_kill_helper_thread(struct os_helper_thread *td) +{ + pthread_cancel(td->handle); + pthread_join(td->handle, NULL); + kfree(td); +} + +void os_fix_helper_thread_signals(void) +{ + sigset_t sigset; + + sigemptyset(&sigset); + + sigaddset(&sigset, SIGWINCH); + sigaddset(&sigset, SIGPIPE); + sigaddset(&sigset, SIGPROF); + sigaddset(&sigset, SIGINT); + sigaddset(&sigset, SIGTERM); + sigaddset(&sigset, SIGCHLD); + sigaddset(&sigset, SIGALRM); + sigaddset(&sigset, SIGIO); + sigaddset(&sigset, SIGUSR1); + + pthread_sigmask(SIG_SETMASK, &sigset, NULL); +} diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h new file mode 100644 index 000000000000..317fca190c2b --- /dev/null +++ b/arch/um/os-Linux/internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_OS_LINUX_INTERNAL_H +#define __UM_OS_LINUX_INTERNAL_H + +/* + * elf_aux.c + */ +void scan_elf_aux(char **envp); + +/* + * mem.c + */ +void check_tmpexec(void); + +/* + * skas/process.c + */ +void wait_stub_done(int pid); + +#endif /* __UM_OS_LINUX_INTERNAL_H */ diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c index c8a42ecbd7a2..3c63ce19e3bf 100644 --- a/arch/um/os-Linux/main.c +++ b/arch/um/os-Linux/main.c @@ -11,19 +11,19 @@ #include <signal.h> #include <string.h> #include <sys/resource.h> +#include <sys/personality.h> #include <as-layout.h> #include <init.h> #include <kern_util.h> #include <os.h> #include <um_malloc.h> +#include "internal.h" -#define PGD_BOUND (4 * 1024 * 1024) #define STACKSIZE (8 * 1024 * 1024) -#define THREAD_NAME_LEN (256) long elf_aux_hwcap; -static void set_stklim(void) +static void __init set_stklim(void) { struct rlimit lim; @@ -46,7 +46,7 @@ static void last_ditch_exit(int sig) exit(1); } -static void install_fatal_handler(int sig) +static void __init install_fatal_handler(int sig) { struct sigaction action; @@ -71,7 +71,7 @@ static void install_fatal_handler(int sig) #define UML_LIB_PATH ":" OS_LIB_PATH "/uml" -static void setup_env_path(void) +static void __init setup_env_path(void) { char *new_path = NULL; char *old_path = NULL; @@ -102,13 +102,26 @@ static void setup_env_path(void) } } -extern void scan_elf_aux( char **envp); - int __init main(int argc, char **argv, char **envp) { char **new_argv; int ret, i, err; + /* Disable randomization and re-exec if it was changed successfully */ + ret = personality(PER_LINUX | ADDR_NO_RANDOMIZE); + if (ret >= 0 && (ret & (PER_LINUX | ADDR_NO_RANDOMIZE)) != + (PER_LINUX | ADDR_NO_RANDOMIZE)) { + char buf[4096] = {}; + ssize_t ret; + + ret = readlink("/proc/self/exe", buf, sizeof(buf)); + if (ret < 0 || ret >= sizeof(buf)) { + perror("readlink failure"); + exit(1); + } + execve(buf, argv, envp); + } + set_stklim(); setup_env_path(); @@ -141,7 +154,7 @@ int __init main(int argc, char **argv, char **envp) #endif change_sig(SIGPIPE, 0); - ret = linux_main(argc, argv); + ret = linux_main(argc, argv, envp); /* * Disable SIGPROF - I have no idea why libc doesn't do this or turn @@ -183,6 +196,12 @@ int __init main(int argc, char **argv, char **envp) } extern 
void *__real_malloc(int); +extern void __real_free(void *); + +/* workaround for -Wmissing-prototypes warnings */ +void *__wrap_malloc(int size); +void *__wrap_calloc(int n, int size); +void __wrap_free(void *ptr); void *__wrap_malloc(int size) { @@ -215,10 +234,6 @@ void *__wrap_calloc(int n, int size) return ptr; } -extern void __real_free(void *); - -extern unsigned long high_physmem; - void __wrap_free(void *ptr) { unsigned long addr = (unsigned long) ptr; diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c index 8530b2e08604..72f302f4d197 100644 --- a/arch/um/os-Linux/mem.c +++ b/arch/um/os-Linux/mem.c @@ -15,7 +15,9 @@ #include <sys/vfs.h> #include <linux/magic.h> #include <init.h> +#include <kern_util.h> #include <os.h> +#include "internal.h" /* * kasan_map_memory - maps memory from @start with a size of @len. @@ -37,10 +39,22 @@ void kasan_map_memory(void *start, size_t len) strerror(errno)); exit(1); } + + if (madvise(start, len, MADV_DONTDUMP)) { + os_info("Couldn't set MAD_DONTDUMP on shadow memory: %s\n.", + strerror(errno)); + exit(1); + } + + if (madvise(start, len, MADV_DONTFORK)) { + os_info("Couldn't set MADV_DONTFORK on shadow memory: %s\n.", + strerror(errno)); + exit(1); + } } /* Set by make_tempfile() during early boot. */ -static char *tempdir = NULL; +char *tempdir = NULL; /* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error. */ static int __init check_tmpfs(const char *dir) diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index e52dd37ddadc..184566edeee9 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -12,94 +12,18 @@ #include <fcntl.h> #include <sys/mman.h> #include <sys/ptrace.h> +#include <sys/prctl.h> #include <sys/wait.h> #include <asm/unistd.h> #include <init.h> #include <longjmp.h> #include <os.h> -#define ARBITRARY_ADDR -1 -#define FAILURE_PID -1 - -#define STAT_PATH_LEN sizeof("/proc/#######/stat\0") -#define COMM_SCANF "%*[^)])" - -unsigned long os_process_pc(int pid) -{ - char proc_stat[STAT_PATH_LEN], buf[256]; - unsigned long pc = ARBITRARY_ADDR; - int fd, err; - - sprintf(proc_stat, "/proc/%d/stat", pid); - fd = open(proc_stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't open '%s', " - "errno = %d\n", proc_stat, errno); - goto out; - } - CATCH_EINTR(err = read(fd, buf, sizeof(buf))); - if (err < 0) { - printk(UM_KERN_ERR "os_process_pc - couldn't read '%s', " - "err = %d\n", proc_stat, errno); - goto out_close; - } - os_close_file(fd); - pc = ARBITRARY_ADDR; - if (sscanf(buf, "%*d " COMM_SCANF " %*c %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d " - "%*d %*d %*d %*d %*d %lu", &pc) != 1) - printk(UM_KERN_ERR "os_process_pc - couldn't find pc in '%s'\n", - buf); - out_close: - close(fd); - out: - return pc; -} - -int os_process_parent(int pid) -{ - char stat[STAT_PATH_LEN]; - char data[256]; - int parent = FAILURE_PID, n, fd; - - if (pid == -1) - return parent; - - snprintf(stat, sizeof(stat), "/proc/%d/stat", pid); - fd = open(stat, O_RDONLY, 0); - if (fd < 0) { - printk(UM_KERN_ERR "Couldn't open '%s', errno = %d\n", stat, - errno); - return parent; - } - - CATCH_EINTR(n = read(fd, data, sizeof(data))); - close(fd); - - if (n < 0) { - printk(UM_KERN_ERR "Couldn't read '%s', errno = %d\n", stat, - errno); - return parent; - } - - parent = FAILURE_PID; - n = sscanf(data, "%*d " COMM_SCANF " %*c %d", &parent); - if (n != 1) - printk(UM_KERN_ERR "Failed to scan '%s'\n", data); - - return parent; -} - void 
os_alarm_process(int pid) { kill(pid, SIGALRM); } -void os_stop_process(int pid) -{ - kill(pid, SIGSTOP); -} - void os_kill_process(int pid, int reap_child) { kill(pid, SIGKILL); @@ -130,11 +54,6 @@ int os_getpid(void) return syscall(__NR_getpid); } -int os_getpgrp(void) -{ - return getpgrp(); -} - int os_map_memory(void *virt, int fd, unsigned long long off, unsigned long len, int r, int w, int x) { @@ -223,57 +142,6 @@ out: return ok; } -static int os_page_mincore(void *addr) -{ - char vec[2]; - int ret; - - ret = mincore(addr, UM_KERN_PAGE_SIZE, vec); - if (ret < 0) { - if (errno == ENOMEM || errno == EINVAL) - return 0; - else - return -errno; - } - - return vec[0] & 1; -} - -int os_mincore(void *addr, unsigned long len) -{ - char *vec; - int ret, i; - - if (len <= UM_KERN_PAGE_SIZE) - return os_page_mincore(addr); - - vec = calloc(1, (len + UM_KERN_PAGE_SIZE - 1) / UM_KERN_PAGE_SIZE); - if (!vec) - return -ENOMEM; - - ret = mincore(addr, UM_KERN_PAGE_SIZE, vec); - if (ret < 0) { - if (errno == ENOMEM || errno == EINVAL) - ret = 0; - else - ret = -errno; - - goto out; - } - - for (i = 0; i < ((len + UM_KERN_PAGE_SIZE - 1) / UM_KERN_PAGE_SIZE); i++) { - if (!(vec[i] & 1)) { - ret = 0; - goto out; - } - } - - ret = 1; -out: - free(vec); - return ret; -} - void init_new_thread_signals(void) { set_handler(SIGSEGV); @@ -285,3 +153,8 @@ void init_new_thread_signals(void) set_handler(SIGIO); signal(SIGWINCH, SIG_IGN); } + +void os_set_pdeathsig(void) +{ + prctl(PR_SET_PDEATHSIG, SIGKILL); +} diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index bd80b921add0..d7ca148807b2 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -10,11 +10,12 @@ #include <sysdep/ptrace.h> #include <sysdep/ptrace_user.h> #include <registers.h> +#include <stdlib.h> /* This is set once at boot time and not changed thereafter */ static unsigned long exec_regs[MAX_REG_NR]; -static unsigned long exec_fp_regs[FP_SIZE]; +static unsigned long *exec_fp_regs; int init_pid_registers(int pid) { @@ -24,7 +25,11 @@ int init_pid_registers(int pid) if (err < 0) return -errno; - arch_init_registers(pid); + err = arch_init_registers(pid); + if (err < 0) + return err; + + exec_fp_regs = malloc(host_fp_size); get_fp_registers(pid, exec_fp_regs); return 0; } @@ -34,5 +39,5 @@ void get_safe_registers(unsigned long *regs, unsigned long *fp_regs) memcpy(regs, exec_regs, sizeof(exec_regs)); if (fp_regs) - memcpy(fp_regs, exec_fp_regs, sizeof(exec_fp_regs)); + memcpy(fp_regs, exec_fp_regs, host_fp_size); } diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index 9e71794839e8..a05a6ecee756 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -11,6 +11,7 @@ #include <sched.h> #include <signal.h> #include <string.h> +#include <sys/epoll.h> #include <kern_util.h> #include <init.h> #include <os.h> @@ -21,183 +22,51 @@ * Protected by sigio_lock(), also used by sigio_cleanup, which is an * exitcall. */ -static int write_sigio_pid = -1; -static unsigned long write_sigio_stack; +static struct os_helper_thread *write_sigio_td; -/* - * These arrays are initialized before the sigio thread is started, and - * the descriptors closed after it is killed. So, it can't see them change. - * On the UML side, they are changed under the sigio_lock. 
- */ -#define SIGIO_FDS_INIT {-1, -1} - -static int write_sigio_fds[2] = SIGIO_FDS_INIT; -static int sigio_private[2] = SIGIO_FDS_INIT; +static int epollfd = -1; -struct pollfds { - struct pollfd *poll; - int size; - int used; -}; +#define MAX_EPOLL_EVENTS 64 -/* - * Protected by sigio_lock(). Used by the sigio thread, but the UML thread - * synchronizes with it. - */ -static struct pollfds current_poll; -static struct pollfds next_poll; -static struct pollfds all_sigio_fds; +static struct epoll_event epoll_events[MAX_EPOLL_EVENTS]; -static int write_sigio_thread(void *unused) +static void *write_sigio_thread(void *unused) { - struct pollfds *fds, tmp; - struct pollfd *p; - int i, n, respond_fd; - char c; + int pid = getpid(); + int r; + + os_fix_helper_thread_signals(); - os_fix_helper_signals(); - fds = ¤t_poll; while (1) { - n = poll(fds->poll, fds->used, -1); - if (n < 0) { + r = epoll_wait(epollfd, epoll_events, MAX_EPOLL_EVENTS, -1); + if (r < 0) { if (errno == EINTR) continue; - printk(UM_KERN_ERR "write_sigio_thread : poll returned " - "%d, errno = %d\n", n, errno); - } - for (i = 0; i < fds->used; i++) { - p = &fds->poll[i]; - if (p->revents == 0) - continue; - if (p->fd == sigio_private[1]) { - CATCH_EINTR(n = read(sigio_private[1], &c, - sizeof(c))); - if (n != sizeof(c)) - printk(UM_KERN_ERR - "write_sigio_thread : " - "read on socket failed, " - "err = %d\n", errno); - tmp = current_poll; - current_poll = next_poll; - next_poll = tmp; - respond_fd = sigio_private[1]; - } - else { - respond_fd = write_sigio_fds[1]; - fds->used--; - memmove(&fds->poll[i], &fds->poll[i + 1], - (fds->used - i) * sizeof(*fds->poll)); - } - - CATCH_EINTR(n = write(respond_fd, &c, sizeof(c))); - if (n != sizeof(c)) - printk(UM_KERN_ERR "write_sigio_thread : " - "write on socket failed, err = %d\n", - errno); + printk(UM_KERN_ERR "%s: epoll_wait failed, errno = %d\n", + __func__, errno); } - } - - return 0; -} - -static int need_poll(struct pollfds *polls, int n) -{ - struct pollfd *new; - - if (n <= polls->size) - return 0; - - new = uml_kmalloc(n * sizeof(struct pollfd), UM_GFP_ATOMIC); - if (new == NULL) { - printk(UM_KERN_ERR "need_poll : failed to allocate new " - "pollfds\n"); - return -ENOMEM; - } - - memcpy(new, polls->poll, polls->used * sizeof(struct pollfd)); - kfree(polls->poll); - - polls->poll = new; - polls->size = n; - return 0; -} - -/* - * Must be called with sigio_lock held, because it's needed by the marked - * critical section. 
- */ -static void update_thread(void) -{ - unsigned long flags; - int n; - char c; - flags = um_set_signals_trace(0); - CATCH_EINTR(n = write(sigio_private[0], &c, sizeof(c))); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "update_thread : write failed, err = %d\n", - errno); - goto fail; + CATCH_EINTR(r = tgkill(pid, pid, SIGIO)); + if (r < 0) + printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n", + __func__, errno); } - CATCH_EINTR(n = read(sigio_private[0], &c, sizeof(c))); - if (n != sizeof(c)) { - printk(UM_KERN_ERR "update_thread : read failed, err = %d\n", - errno); - goto fail; - } - - um_set_signals_trace(flags); - return; - fail: - /* Critical section start */ - if (write_sigio_pid != -1) { - os_kill_process(write_sigio_pid, 1); - free_stack(write_sigio_stack, 0); - } - write_sigio_pid = -1; - close(sigio_private[0]); - close(sigio_private[1]); - close(write_sigio_fds[0]); - close(write_sigio_fds[1]); - /* Critical section end */ - um_set_signals_trace(flags); + return NULL; } int __add_sigio_fd(int fd) { - struct pollfd *p; - int err, i, n; - - for (i = 0; i < all_sigio_fds.used; i++) { - if (all_sigio_fds.poll[i].fd == fd) - break; - } - if (i == all_sigio_fds.used) - return -ENOSPC; - - p = &all_sigio_fds.poll[i]; - - for (i = 0; i < current_poll.used; i++) { - if (current_poll.poll[i].fd == fd) - return 0; - } - - n = current_poll.used; - err = need_poll(&next_poll, n + 1); - if (err) - return err; - - memcpy(next_poll.poll, current_poll.poll, - current_poll.used * sizeof(struct pollfd)); - next_poll.poll[n] = *p; - next_poll.used = n + 1; - update_thread(); - - return 0; + struct epoll_event event = { + .data.fd = fd, + .events = EPOLLIN | EPOLLET, + }; + int r; + + CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event)); + return r < 0 ? -errno : 0; } - int add_sigio_fd(int fd) { int err; @@ -211,38 +80,11 @@ int add_sigio_fd(int fd) int __ignore_sigio_fd(int fd) { - struct pollfd *p; - int err, i, n = 0; - - /* - * This is called from exitcalls elsewhere in UML - if - * sigio_cleanup has already run, then update_thread will hang - * or fail because the thread is no longer running. - */ - if (write_sigio_pid == -1) - return -EIO; - - for (i = 0; i < current_poll.used; i++) { - if (current_poll.poll[i].fd == fd) - break; - } - if (i == current_poll.used) - return -ENOENT; - - err = need_poll(&next_poll, current_poll.used - 1); - if (err) - return err; - - for (i = 0; i < current_poll.used; i++) { - p = ¤t_poll.poll[i]; - if (p->fd != fd) - next_poll.poll[n++] = *p; - } - next_poll.used = current_poll.used - 1; - - update_thread(); + struct epoll_event event; + int r; - return 0; + CATCH_EINTR(r = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event)); + return r < 0 ? -errno : 0; } int ignore_sigio_fd(int fd) @@ -256,125 +98,37 @@ int ignore_sigio_fd(int fd) return err; } -static struct pollfd *setup_initial_poll(int fd) -{ - struct pollfd *p; - - p = uml_kmalloc(sizeof(struct pollfd), UM_GFP_KERNEL); - if (p == NULL) { - printk(UM_KERN_ERR "setup_initial_poll : failed to allocate " - "poll\n"); - return NULL; - } - *p = ((struct pollfd) { .fd = fd, - .events = POLLIN, - .revents = 0 }); - return p; -} - static void write_sigio_workaround(void) { - struct pollfd *p; int err; - int l_write_sigio_fds[2]; - int l_sigio_private[2]; - int l_write_sigio_pid; - /* We call this *tons* of times - and most ones we must just fail. 
*/ sigio_lock(); - l_write_sigio_pid = write_sigio_pid; - sigio_unlock(); - - if (l_write_sigio_pid != -1) - return; + if (write_sigio_td) + goto out; - err = os_pipe(l_write_sigio_fds, 1, 1); - if (err < 0) { - printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 1 failed, " - "err = %d\n", -err); - return; + epollfd = epoll_create(MAX_EPOLL_EVENTS); + if (epollfd < 0) { + printk(UM_KERN_ERR "%s: epoll_create failed, errno = %d\n", + __func__, errno); + goto out; } - err = os_pipe(l_sigio_private, 1, 1); + + err = os_run_helper_thread(&write_sigio_td, write_sigio_thread, NULL); if (err < 0) { - printk(UM_KERN_ERR "write_sigio_workaround - os_pipe 2 failed, " - "err = %d\n", -err); - goto out_close1; + printk(UM_KERN_ERR "%s: os_run_helper_thread failed, errno = %d\n", + __func__, -err); + close(epollfd); + epollfd = -1; + goto out; } - p = setup_initial_poll(l_sigio_private[1]); - if (!p) - goto out_close2; - - sigio_lock(); - - /* - * Did we race? Don't try to optimize this, please, it's not so likely - * to happen, and no more than once at the boot. - */ - if (write_sigio_pid != -1) - goto out_free; - - current_poll = ((struct pollfds) { .poll = p, - .used = 1, - .size = 1 }); - - if (write_sigio_irq(l_write_sigio_fds[0])) - goto out_clear_poll; - - memcpy(write_sigio_fds, l_write_sigio_fds, sizeof(l_write_sigio_fds)); - memcpy(sigio_private, l_sigio_private, sizeof(l_sigio_private)); - - write_sigio_pid = run_helper_thread(write_sigio_thread, NULL, - CLONE_FILES | CLONE_VM, - &write_sigio_stack); - - if (write_sigio_pid < 0) - goto out_clear; - - sigio_unlock(); - return; - -out_clear: - write_sigio_pid = -1; - write_sigio_fds[0] = -1; - write_sigio_fds[1] = -1; - sigio_private[0] = -1; - sigio_private[1] = -1; -out_clear_poll: - current_poll = ((struct pollfds) { .poll = NULL, - .size = 0, - .used = 0 }); -out_free: +out: sigio_unlock(); - kfree(p); -out_close2: - close(l_sigio_private[0]); - close(l_sigio_private[1]); -out_close1: - close(l_write_sigio_fds[0]); - close(l_write_sigio_fds[1]); } -void sigio_broken(int fd) +void sigio_broken(void) { - int err; - write_sigio_workaround(); - - sigio_lock(); - err = need_poll(&all_sigio_fds, all_sigio_fds.used + 1); - if (err) { - printk(UM_KERN_ERR "maybe_sigio_broken - failed to add pollfd " - "for descriptor %d\n", fd); - goto out; - } - - all_sigio_fds.poll[all_sigio_fds.used++] = - ((struct pollfd) { .fd = fd, - .events = POLLIN, - .revents = 0 }); -out: - sigio_unlock(); } /* Changed during early boot */ @@ -388,17 +142,16 @@ void maybe_sigio_broken(int fd) if (pty_output_sigio) return; - sigio_broken(fd); + sigio_broken(); } static void sigio_cleanup(void) { - if (write_sigio_pid == -1) + if (!write_sigio_td) return; - os_kill_process(write_sigio_pid, 1); - free_stack(write_sigio_stack, 0); - write_sigio_pid = -1; + os_kill_helper_thread(write_sigio_td); + write_sigio_td = NULL; } __uml_exitcall(sigio_cleanup); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 24a403a70a02..e71e5b4878d1 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -8,6 +8,7 @@ #include <stdlib.h> #include <stdarg.h> +#include <stdbool.h> #include <errno.h> #include <signal.h> #include <string.h> @@ -20,12 +21,12 @@ #include <sys/ucontext.h> #include <timetravel.h> -void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { +void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = { [SIGTRAP] = relay_signal, [SIGFPE] = relay_signal, [SIGILL] = relay_signal, [SIGWINCH] = winch, - 
[SIGBUS] = bus_handler, + [SIGBUS] = relay_signal, [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, }; @@ -46,7 +47,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) if ((sig != SIGIO) && (sig != SIGWINCH)) unblock_signals_trace(); - (*sig_info[sig])(sig, si, &r); + (*sig_info[sig])(sig, si, &r, mc); errno = save_errno; } @@ -64,26 +65,37 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) #define SIGALRM_MASK (1 << SIGALRM_BIT) int signals_enabled; -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT -static int signals_blocked; -#else -#define signals_blocked 0 +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) +static int signals_blocked, signals_blocked_pending; #endif static unsigned int signals_pending; static unsigned int signals_active = 0; -void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) +static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { int enabled = signals_enabled; - if ((signals_blocked || !enabled) && (sig == SIGIO)) { +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) + if ((signals_blocked || + __atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) && + (sig == SIGIO)) { + /* increment so unblock will do another round */ + __atomic_add_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST); + return; + } +#endif + + if (!enabled && (sig == SIGIO)) { /* * In TT_MODE_EXTERNAL, need to still call time-travel - * handlers unless signals are also blocked for the - * external time message processing. This will mark - * signals_pending by itself (only if necessary.) + * handlers. This will mark signals_pending by itself + * (only if necessary.) + * Note we won't get here if signals are hard-blocked + * (which is handled above), in that case the hard- + * unblock will handle things. */ - if (!signals_blocked && time_travel_mode == TT_MODE_EXTERNAL) + if (time_travel_mode == TT_MODE_EXTERNAL) sigio_run_timetravel_handlers(); else signals_pending |= SIGIO_MASK; @@ -108,7 +120,7 @@ static void timer_real_alarm_handler(mcontext_t *mc) timer_handler(SIGALRM, NULL, ®s); } -void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) +static void timer_alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc) { int enabled; @@ -178,43 +190,8 @@ static void hard_handler(int sig, siginfo_t *si, void *p) { ucontext_t *uc = p; mcontext_t *mc = &uc->uc_mcontext; - unsigned long pending = 1UL << sig; - - do { - int nested, bail; - - /* - * pending comes back with one bit set for each - * interrupt that arrived while setting up the stack, - * plus a bit for this interrupt, plus the zero bit is - * set if this is a nested interrupt. - * If bail is true, then we interrupted another - * handler setting up the stack. In this case, we - * have to return, and the upper handler will deal - * with this interrupt. - */ - bail = to_irq_stack(&pending); - if (bail) - return; - - nested = pending & 1; - pending &= ~1; - while ((sig = ffs(pending)) != 0){ - sig--; - pending &= ~(1 << sig); - (*handlers[sig])(sig, (struct siginfo *)si, mc); - } - - /* - * Again, pending comes back with a mask of signals - * that arrived while tearing down the stack. If this - * is non-zero, we just go back, set up the stack - * again, and handle the new interrupts. 
- */ - if (!nested) - pending = from_irq_stack(nested); - } while (pending); + (*handlers[sig])(sig, (struct siginfo *)si, mc); } void set_handler(int sig) @@ -285,7 +262,7 @@ void unblock_signals(void) return; signals_enabled = 1; -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) deliver_time_travel_irqs(); #endif @@ -377,43 +354,101 @@ int um_set_signals_trace(int enable) return ret; } -#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +#if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) void mark_sigio_pending(void) { + /* + * It would seem that this should be atomic so + * it isn't a read-modify-write with a signal + * that could happen in the middle, losing the + * value set by the signal. + * + * However, this function is only called when in + * time-travel=ext simulation mode, in which case + * the only signal ever pending is SIGIO, which + * is blocked while this can be called, and the + * timer signal (SIGALRM) cannot happen. + */ signals_pending |= SIGIO_MASK; } void block_signals_hard(void) { - if (signals_blocked) - return; - signals_blocked = 1; + signals_blocked++; barrier(); } void unblock_signals_hard(void) { + static bool unblocking; + if (!signals_blocked) + panic("unblocking signals while not blocked"); + + if (--signals_blocked) return; - /* Must be set to 0 before we check the pending bits etc. */ - signals_blocked = 0; + /* + * Must be set to 0 before we check pending so the + * SIGIO handler will run as normal unless we're still + * going to process signals_blocked_pending. + */ barrier(); - if (signals_pending && signals_enabled) { - /* this is a bit inefficient, but that's not really important */ - block_signals(); - unblock_signals(); - } else if (signals_pending & SIGIO_MASK) { - /* we need to run time-travel handlers even if not enabled */ - sigio_run_timetravel_handlers(); - } -} -#endif + /* + * Note that block_signals_hard()/unblock_signals_hard() can be called + * within the unblock_signals()/sigio_run_timetravel_handlers() below. + * This would still be prone to race conditions since it's actually a + * call _within_ e.g. vu_req_read_message(), where we observed this + * issue, which loops. Thus, if the inner call handles the recorded + * pending signals, we can get out of the inner call with the real + * signal hander no longer blocked, and still have a race. Thus don't + * handle unblocking in the inner call, if it happens, but only in + * the outermost call - 'unblocking' serves as an ownership for the + * signals_blocked_pending decrement. + */ + if (unblocking) + return; + unblocking = true; -int os_is_signal_stack(void) -{ - stack_t ss; - sigaltstack(NULL, &ss); + while (__atomic_load_n(&signals_blocked_pending, __ATOMIC_SEQ_CST)) { + if (signals_enabled) { + /* signals are enabled so we can touch this */ + signals_pending |= SIGIO_MASK; + /* + * this is a bit inefficient, but that's + * not really important + */ + block_signals(); + unblock_signals(); + } else { + /* + * we need to run time-travel handlers even + * if not enabled + */ + sigio_run_timetravel_handlers(); + } - return ss.ss_flags & SS_ONSTACK; + /* + * The decrement of signals_blocked_pending must be atomic so + * that the signal handler will either happen before or after + * the decrement, not during a read-modify-write: + * - If it happens before, it can increment it and we'll + * decrement it and do another round in the loop. 
+ * - If it happens after it'll see 0 for both signals_blocked + * and signals_blocked_pending and thus run the handler as + * usual (subject to signals_enabled, but that's unrelated.) + * + * Note that a call to unblock_signals_hard() within the calls + * to unblock_signals() or sigio_run_timetravel_handlers() above + * will do nothing due to the 'unblocking' state, so this cannot + * underflow as the only one decrementing will be the outermost + * one. + */ + if (__atomic_sub_fetch(&signals_blocked_pending, 1, + __ATOMIC_SEQ_CST) < 0) + panic("signals_blocked_pending underflow"); + } + + unblocking = false; } +#endif diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 953fb10f3f93..d7f1814b0e5a 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -17,10 +18,32 @@ #include <skas.h> #include <sysdep/ptrace.h> #include <sysdep/stub.h> +#include "../internal.h" -extern char batch_syscall_stub[], __syscall_stub_start[]; +extern char __syscall_stub_start[]; -extern void wait_stub_done(int pid); +void syscall_stub_dump_error(struct mm_id *mm_idp) +{ + struct stub_data *proc_data = (void *)mm_idp->stack; + struct stub_syscall *sc; + + if (proc_data->syscall_data_len < 0 || + proc_data->syscall_data_len >= ARRAY_SIZE(proc_data->syscall_data)) + panic("Syscall data was corrupted by stub (len is: %d, expected maximum: %d)!", + proc_data->syscall_data_len, + mm_idp->syscall_data_len); + + sc = &proc_data->syscall_data[proc_data->syscall_data_len]; + + printk(UM_KERN_ERR "%s : length = %d, last offset = %d", + __func__, mm_idp->syscall_data_len, + proc_data->syscall_data_len); + printk(UM_KERN_ERR "%s : stub syscall type %d failed, return value = 0x%lx\n", + __func__, sc->syscall, proc_data->err); + + print_hex_dump(UM_KERN_ERR, " syscall data: ", 0, + 16, 4, sc, sizeof(*sc), 0); +} static inline unsigned long *check_init_stack(struct mm_id * mm_idp, unsigned long *stack) @@ -37,23 +60,25 @@ static unsigned long syscall_regs[MAX_REG_NR]; static int __init init_syscall_regs(void) { get_safe_registers(syscall_regs, NULL); + syscall_regs[REGS_IP_INDEX] = STUB_CODE + - ((unsigned long) batch_syscall_stub - + ((unsigned long) stub_syscall_handler - (unsigned long) __syscall_stub_start); - syscall_regs[REGS_SP_INDEX] = STUB_DATA; + syscall_regs[REGS_SP_INDEX] = STUB_DATA + + offsetof(struct stub_data, sigstack) + + sizeof(((struct stub_data *) 0)->sigstack) - + sizeof(void *); return 0; } __initcall(init_syscall_regs); -static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) +static inline long do_syscall_stub(struct mm_id *mm_idp) { + struct stub_data *proc_data = (void *)mm_idp->stack; int n, i; - long ret, offset; - unsigned long * data; - unsigned long * syscall; - int err, pid = mm_idp->u.pid; + int err, pid = mm_idp->pid; n = ptrace_setregs(pid, syscall_regs); if (n < 0) { @@ -64,6 +89,9 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) __func__, -n); } + /* Inform process how much we have filled in. 
*/ + proc_data->syscall_data_len = mm_idp->syscall_data_len; + err = ptrace(PTRACE_CONT, pid, 0, 0); if (err) panic("Failed to continue stub, pid = %d, errno = %d\n", pid, @@ -72,135 +100,120 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr) wait_stub_done(pid); /* - * When the stub stops, we find the following values on the - * beginning of the stack: - * (long )return_value - * (long )offset to failed sycall-data (0, if no error) + * proc_data->err will be non-zero if there was an (unexpected) error. + * In that case, syscall_data_len points to the last executed syscall, + * otherwise it will be zero (but we do not need to rely on that). */ - ret = *((unsigned long *) mm_idp->stack); - offset = *((unsigned long *) mm_idp->stack + 1); - if (offset) { - data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA); - printk(UM_KERN_ERR "%s : ret = %ld, offset = %ld, data = %p\n", - __func__, ret, offset, data); - syscall = (unsigned long *)((unsigned long)data + data[0]); - printk(UM_KERN_ERR "%s: syscall %ld failed, return value = 0x%lx, expected return value = 0x%lx\n", - __func__, syscall[0], ret, syscall[7]); - printk(UM_KERN_ERR " syscall parameters: 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", - syscall[1], syscall[2], syscall[3], - syscall[4], syscall[5], syscall[6]); - for (n = 1; n < data[0]/sizeof(long); n++) { - if (n == 1) - printk(UM_KERN_ERR " additional syscall data:"); - if (n % 4 == 1) - printk("\n" UM_KERN_ERR " "); - printk(" 0x%lx", data[n]); - } - if (n > 1) - printk("\n"); - } - else ret = 0; + if (proc_data->err < 0) { + syscall_stub_dump_error(mm_idp); - *addr = check_init_stack(mm_idp, NULL); + /* Store error code in case someone tries to add more syscalls */ + mm_idp->syscall_data_len = proc_data->err; + } else { + mm_idp->syscall_data_len = 0; + } - return ret; + return mm_idp->syscall_data_len; } -long run_syscall_stub(struct mm_id * mm_idp, int syscall, - unsigned long *args, long expected, void **addr, - int done) +int syscall_stub_flush(struct mm_id *mm_idp) { - unsigned long *stack = check_init_stack(mm_idp, *addr); - - *stack += sizeof(long); - stack += *stack / sizeof(long); - - *stack++ = syscall; - *stack++ = args[0]; - *stack++ = args[1]; - *stack++ = args[2]; - *stack++ = args[3]; - *stack++ = args[4]; - *stack++ = args[5]; - *stack++ = expected; - *stack = 0; - - if (!done && ((((unsigned long) stack) & ~UM_KERN_PAGE_MASK) < - UM_KERN_PAGE_SIZE - 10 * sizeof(long))) { - *addr = stack; + int res; + + if (mm_idp->syscall_data_len == 0) return 0; + + /* If an error happened already, report it and reset the state. */ + if (mm_idp->syscall_data_len < 0) { + res = mm_idp->syscall_data_len; + mm_idp->syscall_data_len = 0; + return res; } - return do_syscall_stub(mm_idp, addr); + res = do_syscall_stub(mm_idp); + mm_idp->syscall_data_len = 0; + + return res; } -long syscall_stub_data(struct mm_id * mm_idp, - unsigned long *data, int data_count, - void **addr, void **stub_addr) +struct stub_syscall *syscall_stub_alloc(struct mm_id *mm_idp) { - unsigned long *stack; - int ret = 0; - - /* - * If *addr still is uninitialized, it *must* contain NULL. - * Thus in this case do_syscall_stub correctly won't be called. 
- */ - if ((((unsigned long) *addr) & ~UM_KERN_PAGE_MASK) >= - UM_KERN_PAGE_SIZE - (10 + data_count) * sizeof(long)) { - ret = do_syscall_stub(mm_idp, addr); - /* in case of error, don't overwrite data on stack */ - if (ret) - return ret; + struct stub_syscall *sc; + struct stub_data *proc_data = (struct stub_data *) mm_idp->stack; + + if (mm_idp->syscall_data_len > 0 && + mm_idp->syscall_data_len == ARRAY_SIZE(proc_data->syscall_data)) + do_syscall_stub(mm_idp); + + if (mm_idp->syscall_data_len < 0) { + /* Return dummy to retain error state. */ + sc = &proc_data->syscall_data[0]; + } else { + sc = &proc_data->syscall_data[mm_idp->syscall_data_len]; + mm_idp->syscall_data_len += 1; } + memset(sc, 0, sizeof(*sc)); - stack = check_init_stack(mm_idp, *addr); - *addr = stack; - - *stack = data_count * sizeof(long); - - memcpy(stack + 1, data, data_count * sizeof(long)); - - *stub_addr = (void *)(((unsigned long)(stack + 1) & - ~UM_KERN_PAGE_MASK) + STUB_DATA); - - return 0; + return sc; } -int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, int prot, - int phys_fd, unsigned long long offset, int done, void **data) +static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, + int syscall_type, + unsigned long virt) { - int ret; - unsigned long args[] = { virt, len, prot, - MAP_SHARED | MAP_FIXED, phys_fd, - MMAP_OFFSET(offset) }; + if (mm_idp->syscall_data_len > 0) { + struct stub_data *proc_data = (void *) mm_idp->stack; + struct stub_syscall *sc; + + sc = &proc_data->syscall_data[mm_idp->syscall_data_len - 1]; - ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt, - data, done); + if (sc->syscall == syscall_type && + sc->mem.addr + sc->mem.length == virt) + return sc; + } - return ret; + return NULL; } -int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - int done, void **data) +int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - int ret; - unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0, - 0 }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0, - data, done); + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt); + if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd && + sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { + sc->mem.length += len; + return 0; + } + + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MMAP; + sc->mem.addr = virt; + sc->mem.length = len; + sc->mem.prot = prot; + sc->mem.fd = phys_fd; + sc->mem.offset = MMAP_OFFSET(offset); - return ret; + return 0; } -int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len, - unsigned int prot, int done, void **data) +int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len) { - int ret; - unsigned long args[] = { addr, len, prot, 0, 0, 0 }; + struct stub_syscall *sc; - ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0, - data, done); + /* Compress with previous syscall if that is possible */ + sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MUNMAP, addr); + if (sc) { + sc->mem.length += len; + return 0; + } + + sc = syscall_stub_alloc(mm_idp); + sc->syscall = STUB_SYSCALL_MUNMAP; + sc->mem.addr = addr; + sc->mem.length = len; - return ret; + return 0; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 1f5c3f2523d1..ae2aea062f06 100644 --- a/arch/um/os-Linux/skas/process.c +++ 
b/arch/um/os-Linux/skas/process.c @@ -10,8 +10,11 @@ #include <sched.h> #include <errno.h> #include <string.h> +#include <fcntl.h> +#include <mem_user.h> #include <sys/mman.h> #include <sys/wait.h> +#include <sys/stat.h> #include <asm/unistd.h> #include <as-layout.h> #include <init.h> @@ -23,6 +26,8 @@ #include <skas.h> #include <sysdep/stub.h> #include <linux/threads.h> +#include <timetravel.h> +#include "../internal.h" int is_skas_winch(int pid, int fd, void *data) { @@ -139,16 +144,10 @@ bad_wait: extern unsigned long current_stub_stack(void); -static void get_skas_faultinfo(int pid, struct faultinfo *fi, unsigned long *aux_fp_regs) +static void get_skas_faultinfo(int pid, struct faultinfo *fi) { int err; - err = get_fp_registers(pid, aux_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "save_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV); if (err) { printk(UM_KERN_ERR "Failed to continue stub, pid = %d, " @@ -162,19 +161,12 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi, unsigned long *aux * the stub stack page. We just have to copy it. */ memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); - - err = put_fp_registers(pid, aux_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "put_fp_registers returned %d\n", - err); - fatal_sigsegv(); - } } -static void handle_segv(int pid, struct uml_pt_regs *regs, unsigned long *aux_fp_regs) +static void handle_segv(int pid, struct uml_pt_regs *regs) { - get_skas_faultinfo(pid, ®s->faultinfo, aux_fp_regs); - segv(regs->faultinfo, 0, 1, NULL); + get_skas_faultinfo(pid, ®s->faultinfo); + segv(regs->faultinfo, 0, 1, NULL, NULL); } static void handle_trap(int pid, struct uml_pt_regs *regs) @@ -187,72 +179,143 @@ static void handle_trap(int pid, struct uml_pt_regs *regs) extern char __syscall_stub_start[]; -/** - * userspace_tramp() - userspace trampoline - * @stack: pointer to the new userspace stack page - * - * The userspace trampoline is used to setup a new userspace process in start_userspace() after it was clone()'ed. - * This function will run on a temporary stack page. - * It ptrace()'es itself, then - * Two pages are mapped into the userspace address space: - * - STUB_CODE (with EXEC), which contains the skas stub code - * - STUB_DATA (with R/W), which contains a data page that is used to transfer certain data between the UML userspace process and the UML kernel. - * Also for the userspace process a SIGSEGV handler is installed to catch pagefaults in the userspace process. - * And last the process stops itself to give control to the UML kernel for this userspace process. 
- * - * Return: Always zero, otherwise the current userspace process is ended with non null exit() call - */ +static int stub_exe_fd; + +#ifndef CLOSE_RANGE_CLOEXEC +#define CLOSE_RANGE_CLOEXEC (1U << 2) +#endif + static int userspace_tramp(void *stack) { - struct sigaction sa; - void *addr; - int fd; + char *const argv[] = { "uml-userspace", NULL }; + int pipe_fds[2]; unsigned long long offset; - unsigned long segv_handler = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) __syscall_stub_start; - - ptrace(PTRACE_TRACEME, 0, 0, 0); - - signal(SIGTERM, SIG_DFL); - signal(SIGWINCH, SIG_IGN); - - fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE, - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset); - if (addr == MAP_FAILED) { - os_info("mapping mmap stub at 0x%lx failed, errno = %d\n", - STUB_CODE, errno); - exit(1); + struct stub_init_data init_data = { + .stub_start = STUB_START, + .segv_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start, + }; + struct iomem_region *iomem; + int ret; + + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), + &offset); + init_data.stub_code_offset = MMAP_OFFSET(offset); + + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); + init_data.stub_data_offset = MMAP_OFFSET(offset); + + /* + * Avoid leaking unneeded FDs to the stub by setting CLOEXEC on all FDs + * and then unsetting it on all memory related FDs. + * This is not strictly necessary from a safety perspective. + */ + syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); + + fcntl(init_data.stub_data_fd, F_SETFD, 0); + for (iomem = iomem_regions; iomem; iomem = iomem->next) + fcntl(iomem->fd, F_SETFD, 0); + + /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ + if (pipe(pipe_fds)) + exit(2); + + if (dup2(pipe_fds[0], 0) < 0) + exit(3); + close(pipe_fds[0]); + + /* Write init_data and close write side */ + ret = write(pipe_fds[1], &init_data, sizeof(init_data)); + close(pipe_fds[1]); + + if (ret != sizeof(init_data)) + exit(4); + + /* Raw execveat for compatibility with older libc versions */ + syscall(__NR_execveat, stub_exe_fd, (unsigned long)"", + (unsigned long)argv, NULL, AT_EMPTY_PATH); + + exit(5); +} + +extern char stub_exe_start[]; +extern char stub_exe_end[]; + +extern char *tempdir; + +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX" + +#ifndef MFD_EXEC +#define MFD_EXEC 0x0010U +#endif + +static int __init init_stub_exe_fd(void) +{ + size_t written = 0; + char *tmpfile = NULL; + + stub_exe_fd = memfd_create("uml-userspace", + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); + + if (stub_exe_fd < 0) { + printk(UM_KERN_INFO "Could not create executable memfd, using temporary file!"); + + tmpfile = malloc(strlen(tempdir) + + strlen(STUB_EXE_NAME_TEMPLATE) + 1); + if (tmpfile == NULL) + panic("Failed to allocate memory for stub binary name"); + + strcpy(tmpfile, tempdir); + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE); + + stub_exe_fd = mkstemp(tmpfile); + if (stub_exe_fd < 0) + panic("Could not create temporary file for stub binary: %d", + -errno); } - fd = phys_mapping(uml_to_phys(stack), &offset); - addr = mmap((void *) STUB_DATA, - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_SHARED, fd, offset); - if (addr == MAP_FAILED) { - os_info("mapping segfault stack at 0x%lx failed, errno = %d\n", - STUB_DATA, errno); - exit(1); + while (written < stub_exe_end - stub_exe_start) { + ssize_t 
res = write(stub_exe_fd, stub_exe_start + written, + stub_exe_end - stub_exe_start - written); + if (res < 0) { + if (errno == EINTR) + continue; + + if (tmpfile) + unlink(tmpfile); + panic("Failed write stub binary: %d", -errno); + } + + written += res; } - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; - sa.sa_sigaction = (void *) segv_handler; - sa.sa_restorer = NULL; - if (sigaction(SIGSEGV, &sa, NULL) < 0) { - os_info("%s - setting SIGSEGV handler failed - errno = %d\n", - __func__, errno); - exit(1); + if (!tmpfile) { + fcntl(stub_exe_fd, F_ADD_SEALS, + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL); + } else { + if (fchmod(stub_exe_fd, 00500) < 0) { + unlink(tmpfile); + panic("Could not make stub binary executable: %d", + -errno); + } + + close(stub_exe_fd); + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); + if (stub_exe_fd < 0) { + unlink(tmpfile); + panic("Could not reopen stub binary: %d", -errno); + } + + unlink(tmpfile); + free(tmpfile); } - kill(os_getpid(), SIGSTOP); return 0; } +__initcall(init_stub_exe_fd); int userspace_pid[NR_CPUS]; -int kill_userspace_mm[NR_CPUS]; /** * start_userspace() - prepare a new userspace process @@ -269,7 +332,7 @@ int start_userspace(unsigned long stub_stack) { void *stack; unsigned long sp; - int pid, status, n, flags, err; + int pid, status, n, err; /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, @@ -285,10 +348,10 @@ int start_userspace(unsigned long stub_stack) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; - flags = CLONE_FILES | SIGCHLD; - /* clone into new userspace process */ - pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack); + pid = clone(userspace_tramp, (void *) sp, + CLONE_VFORK | CLONE_VM | SIGCHLD, + (void *)stub_stack); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", @@ -335,7 +398,10 @@ int start_userspace(unsigned long stub_stack) return err; } -void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) +int unscheduled_userspace_iterations; +extern unsigned long tt_extra_sched_jiffies; + +void userspace(struct uml_pt_regs *regs) { int err, status, op, pid = userspace_pid[0]; siginfo_t si; @@ -344,8 +410,43 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) interrupt_end(); while (1) { - if (kill_userspace_mm[0]) + /* + * When we are in time-travel mode, userspace can theoretically + * do a *lot* of work without being scheduled. The problem with + * this is that it will prevent kernel bookkeeping (primarily + * the RCU) from running and this can for example cause OOM + * situations. + * + * This code accounts a jiffie against the scheduling clock + * after the defined userspace iterations in the same thread. + * By doing so the situation is effectively prevented. 
+ */ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { +#ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS + if (CONFIG_UML_MAX_USERSPACE_ITERATIONS && + unscheduled_userspace_iterations++ > + CONFIG_UML_MAX_USERSPACE_ITERATIONS) { + tt_extra_sched_jiffies += 1; + unscheduled_userspace_iterations = 0; + } +#endif + } + + time_travel_print_bc_msg(); + + current_mm_sync(); + + /* Flush out any pending syscalls */ + err = syscall_stub_flush(current_mm_id()); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); fatal_sigsegv(); + } /* * This can legitimately fail if the process loads a @@ -422,17 +523,17 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) case SIGSEGV: if (PTRACE_FULL_FAULTINFO) { get_skas_faultinfo(pid, - ®s->faultinfo, aux_fp_regs); + ®s->faultinfo); (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, - regs); + regs, NULL); } - else handle_segv(pid, regs, aux_fp_regs); + else handle_segv(pid, regs); break; case SIGTRAP + 0x80: handle_trap(pid, regs); break; case SIGTRAP: - relay_signal(SIGTRAP, (struct siginfo *)&si, regs); + relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL); break; case SIGALRM: break; @@ -442,7 +543,7 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) case SIGFPE: case SIGWINCH: block_signals_trace(); - (*sig_info[sig])(sig, (struct siginfo *)&si, regs); + (*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL); unblock_signals_trace(); break; default: @@ -460,113 +561,6 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs) } } -static unsigned long thread_regs[MAX_REG_NR]; -static unsigned long thread_fp_regs[FP_SIZE]; - -static int __init init_thread_regs(void) -{ - get_safe_registers(thread_regs, thread_fp_regs); - /* Set parent's instruction pointer to start of clone-stub */ - thread_regs[REGS_IP_INDEX] = STUB_CODE + - (unsigned long) stub_clone_handler - - (unsigned long) __syscall_stub_start; - thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - - sizeof(void *); -#ifdef __SIGNAL_FRAMESIZE - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE; -#endif - return 0; -} - -__initcall(init_thread_regs); - -int copy_context_skas0(unsigned long new_stack, int pid) -{ - int err; - unsigned long current_stack = current_stub_stack(); - struct stub_data *data = (struct stub_data *) current_stack; - struct stub_data *child_data = (struct stub_data *) new_stack; - unsigned long long new_offset; - int new_fd = phys_mapping(uml_to_phys((void *)new_stack), &new_offset); - - /* - * prepare offset and fd of child's stack as argument for parent's - * and child's mmap2 calls - */ - *data = ((struct stub_data) { - .offset = MMAP_OFFSET(new_offset), - .fd = new_fd, - .parent_err = -ESRCH, - .child_err = 0, - }); - - *child_data = ((struct stub_data) { - .child_err = -ESRCH, - }); - - err = ptrace_setregs(pid, thread_regs); - if (err < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n", - __func__, pid, -err); - return err; - } - - err = put_fp_registers(pid, thread_fp_regs); - if (err < 0) { - printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n", - __func__, pid, err); - return err; - } - - /* - * Wait, until parent has finished its work: read child's pid from - * parent's stack, and check, if bad result. 
- */ - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) { - err = -errno; - printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n", - pid, errno); - return err; - } - - wait_stub_done(pid); - - pid = data->parent_err; - if (pid < 0) { - printk(UM_KERN_ERR "%s - stub-parent reports error %d\n", - __func__, -pid); - return pid; - } - - /* - * Wait, until child has finished too: read child's result from - * child's stack and check it. - */ - wait_stub_done(pid); - if (child_data->child_err != STUB_DATA) { - printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n", - __func__, pid, data->child_err); - err = data->child_err; - goto out_kill; - } - - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, - (void *)PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", - __func__, errno); - goto out_kill; - } - - return pid; - - out_kill: - os_kill_ptraced_process(pid, 1); - return err; -} - void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) { (*buf)[0].JB_IP = (unsigned long) handler; @@ -581,6 +575,8 @@ void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)) void switch_threads(jmp_buf *me, jmp_buf *you) { + unscheduled_userspace_iterations = 0; + if (UML_SETJMP(me) == 0) UML_LONGJMP(you, 1); } @@ -664,6 +660,7 @@ static bool noreboot; static int __init noreboot_cmd_param(char *str, int *add) { + *add = 0; noreboot = true; return 0; } @@ -682,6 +679,5 @@ void reboot_skas(void) void __switch_mm(struct mm_id *mm_idp) { - userspace_pid[0] = mm_idp->u.pid; - kill_userspace_mm[0] = mm_idp->kill; + userspace_pid[0] = mm_idp->pid; } diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 8b0e98ab842c..93fc82c01aba 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -17,13 +17,16 @@ #include <sys/wait.h> #include <sys/time.h> #include <sys/resource.h> +#include <asm/ldt.h> #include <asm/unistd.h> #include <init.h> #include <os.h> +#include <kern_util.h> #include <mem_user.h> #include <ptrace_user.h> #include <registers.h> #include <skas.h> +#include "internal.h" static void ptrace_child(void) { @@ -221,8 +224,6 @@ static void __init check_ptrace(void) check_sysemu(); } -extern void check_tmpexec(void); - static void __init check_coredump_limit(void) { struct rlimit lim; diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index e09d65b05d1c..eb523ab1e218 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -358,6 +358,8 @@ char *get_umid(void) static int __init set_uml_dir(char *name, int *add) { + *add = 0; + if (*name == '\0') { os_warn("uml_dir can't be an empty string\n"); return 0; diff --git a/arch/um/os-Linux/util.c b/arch/um/os-Linux/util.c index 1dca4ffbd572..4193e04d7e4a 100644 --- a/arch/um/os-Linux/util.c +++ b/arch/um/os-Linux/util.c @@ -52,8 +52,8 @@ void setup_machinename(char *machine_out) struct utsname host; uname(&host); -#ifdef UML_CONFIG_UML_X86 -# ifndef UML_CONFIG_64BIT +#if IS_ENABLED(CONFIG_UML_X86) +# if !IS_ENABLED(CONFIG_64BIT) if (!strcmp(host.machine, "x86_64")) { strcpy(machine_out, "i686"); return; |