diff options
Diffstat (limited to 'arch/um/kernel')
32 files changed, 866 insertions, 1101 deletions
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 811188be954c..4df1cd0d2017 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -17,7 +17,7 @@ extra-y := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o @@ -47,7 +47,7 @@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE $(call if_changed,quote2) quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ + cmd_mkcapflags = $(CONFIG_SHELL) $(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ cpufeature = $(src)/../../x86/include/asm/cpufeatures.h vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c index 4954188a6a09..15c342426489 100644 --- a/arch/um/kernel/dtb.c +++ b/arch/um/kernel/dtb.c @@ -17,7 +17,7 @@ void uml_dtb_init(void) area = uml_load_file(dtb, &size); if (area) { - if (!early_init_dt_scan(area)) { + if (!early_init_dt_scan(area, __pa(area))) { pr_err("invalid DTB %s\n", dtb); memblock_free(area, size); return; @@ -31,6 +31,7 @@ void uml_dtb_init(void) static int __init uml_dtb_setup(char *line, int *add) { + *add = 0; dtb = line; return 0; } diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 3385d653ebd0..a36b7918a011 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -116,8 +116,6 @@ SECTIONS .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . 
= ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) @@ -178,3 +176,6 @@ SECTIONS DISCARDS } + +ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE, + "STUB code must not be larger than one page"); diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 827a0d3fa589..cb8b5cd9285c 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -22,17 +22,8 @@ void flush_thread(void) { - void *data = NULL; - int ret; - arch_flush_thread(&current->thread.arch); - ret = unmap(&current->mm->context.id, 0, TASK_SIZE, 1, &data); - if (ret) { - printk(KERN_ERR "%s - clearing address space failed, err = %d\n", - __func__, ret); - force_sig(SIGKILL); - } get_safe_registers(current_pt_regs()->regs.gp, current_pt_regs()->regs.fp); @@ -44,8 +35,5 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) PT_REGS_IP(regs) = eip; PT_REGS_SP(regs) = esp; clear_thread_flag(TIF_SINGLESTEP); -#ifdef SUBARCH_EXECVE1 - SUBARCH_EXECVE1(regs->regs); -#endif } EXPORT_SYMBOL(start_thread); diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 47b8cb1a1156..99dba827461c 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -34,6 +34,7 @@ int __init read_initrd(void) static int __init uml_initrd_setup(char *line, int *add) { + *add = 0; initrd = line; return 0; } diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 635d44606bfe..abe8f30a521c 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -37,7 +37,7 @@ struct irq_reg { bool pending; bool wakeup; #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT - bool pending_on_resume; + bool pending_event; void (*timetravel_handler)(int, int, void *, struct time_travel_event *); struct time_travel_event event; @@ -52,10 +52,13 @@ struct irq_entry { bool sigio_workaround; }; -static DEFINE_SPINLOCK(irq_lock); +static DEFINE_RAW_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, 
UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static bool irqs_pending; +#endif static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) { @@ -84,9 +87,12 @@ static void irq_event_handler(struct time_travel_event *ev) { struct irq_reg *reg = container_of(ev, struct irq_reg, event); - /* do nothing if suspended - just to cause a wakeup */ - if (irqs_suspended) + /* do nothing if suspended; just cause a wakeup and mark as pending */ + if (irqs_suspended) { + irqs_pending = true; + reg->pending_event = true; return; + } generic_handle_irq(reg->irq); } @@ -110,16 +116,47 @@ static bool irq_do_timetravel_handler(struct irq_entry *entry, if (!reg->event.pending) return false; - if (irqs_suspended) - reg->pending_on_resume = true; return true; } + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ + struct irq_entry *entry; + + if (!irqs_pending || timetravel_handlers_only) + return; + + irqs_pending = false; + + list_for_each_entry(entry, &active_fds, list) { + enum um_irq_type t; + + for (t = 0; t < NUM_IRQ_TYPES; t++) { + struct irq_reg *reg = &entry->reg[t]; + + /* + * Any timetravel_handler was invoked already, just + * directly run the IRQ. 
+ */ + if (reg->pending_event) { + irq_enter(); + generic_handle_irq(reg->irq); + irq_exit(); + reg->pending_event = false; + } + } + } +} #else static bool irq_do_timetravel_handler(struct irq_entry *entry, enum um_irq_type t) { return false; } + +static void irq_do_pending_events(bool timetravel_handlers_only) +{ +} #endif static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, @@ -145,6 +182,8 @@ static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type */ if (timetravel_handlers_only) { #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + reg->pending_event = true; + irqs_pending = true; mark_sigio_pending(); #endif return; @@ -162,6 +201,10 @@ static void _sigio_handler(struct uml_pt_regs *regs, if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) return; + /* Flush out pending events that were ignored due to time-travel. */ + if (!irqs_suspended) + irq_do_pending_events(timetravel_handlers_only); + while (1) { /* This is now lockless - epoll keeps back-referencesto the irqs * which have trigger it so there is no need to walk the irq @@ -193,9 +236,12 @@ static void _sigio_handler(struct uml_pt_regs *regs, free_irqs(); } -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { + preempt_disable(); _sigio_handler(regs, irqs_suspended); + preempt_enable(); } static struct irq_entry *get_irq_entry_by_fd(int fd) @@ -212,7 +258,7 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) return NULL; } -static void free_irq_entry(struct irq_entry *to_free, bool remove) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { if (!to_free) return; @@ -220,7 +266,6 @@ static void free_irq_entry(struct irq_entry *to_free, bool remove) if (remove) os_del_epoll_fd(to_free->fd); list_del(&to_free->list); - kfree(to_free); } static bool update_irq_entry(struct irq_entry *entry) @@ -241,17 
+286,19 @@ static bool update_irq_entry(struct irq_entry *entry) return false; } -static void update_or_free_irq_entry(struct irq_entry *entry) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - if (!update_irq_entry(entry)) - free_irq_entry(entry, false); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, void (*timetravel_handler)(int, int, void *, struct time_travel_event *)) { - struct irq_entry *irq_entry; + struct irq_entry *irq_entry, *to_free = NULL; int err, events = os_event_mask(type); unsigned long flags; @@ -259,9 +306,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, if (err < 0) goto out; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); irq_entry = get_irq_entry_by_fd(fd); if (irq_entry) { +already: /* cannot register the same FD twice with the same type */ if (WARN_ON(irq_entry->reg[type].events)) { err = -EALREADY; @@ -271,11 +319,22 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, /* temporarily disable to avoid IRQ-side locking */ os_del_epoll_fd(fd); } else { - irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); - if (!irq_entry) { - err = -ENOMEM; - goto out_unlock; + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; + } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; } + irq_entry = new; irq_entry->fd = fd; list_add_tail(&irq_entry->list, &active_fds); maybe_sigio_broken(fd); @@ -294,12 +353,11 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, #endif WARN_ON(!update_irq_entry(irq_entry)); - spin_unlock_irqrestore(&irq_lock, flags); - - return 0; + err 
= 0; out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); out: + kfree(to_free); return err; } @@ -313,19 +371,20 @@ void free_irq_by_fd(int fd) struct irq_entry *to_free; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - free_irq_entry(to_free, true); - spin_unlock_irqrestore(&irq_lock, flags); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *entry; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type i; @@ -341,12 +400,13 @@ static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) os_del_epoll_fd(entry->fd); reg->events = 0; - update_or_free_irq_entry(entry); + to_free = update_or_remove_irq_entry(entry); goto out; } } out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) @@ -357,7 +417,7 @@ void deactivate_fd(int fd, int irqnum) os_del_epoll_fd(fd); - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); entry = get_irq_entry_by_fd(fd); if (!entry) goto out; @@ -369,9 +429,10 @@ void deactivate_fd(int fd, int irqnum) entry->reg[i].events = 0; } - update_or_free_irq_entry(entry); + entry = update_or_remove_irq_entry(entry); out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -501,7 +562,7 @@ void um_irqs_suspend(void) irqs_suspended = true; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum 
um_irq_type t; bool clear = true; @@ -534,7 +595,7 @@ void um_irqs_suspend(void) !__ignore_sigio_fd(entry->fd); } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); } void um_irqs_resume(void) @@ -543,30 +604,7 @@ void um_irqs_resume(void) unsigned long flags; - local_irq_save(flags); -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT - /* - * We don't need to lock anything here since we're in resume - * and nothing else is running, but have disabled IRQs so we - * don't try anything else with the interrupt list from there. - */ - list_for_each_entry(entry, &active_fds, list) { - enum um_irq_type t; - - for (t = 0; t < NUM_IRQ_TYPES; t++) { - struct irq_reg *reg = &entry->reg[t]; - - if (reg->pending_on_resume) { - irq_enter(); - generic_handle_irq(reg->irq); - irq_exit(); - reg->pending_on_resume = false; - } - } - } -#endif - - spin_lock(&irq_lock); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); @@ -580,7 +618,7 @@ void um_irqs_resume(void) } } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); irqs_suspended = false; send_sigio_to_self(); @@ -591,7 +629,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) struct irq_entry *entry; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; @@ -606,7 +644,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) } } unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); return 0; } #else @@ -652,115 +690,3 @@ void __init init_IRQ(void) /* Initialize EPOLL Loop */ os_setup_epoll(); } - -/* - * IRQ stack entry and exit: - * - * Unlike i386, UML doesn't receive IRQs on the normal kernel stack - * and switch over to the IRQ stack after some preparation. 
We use - * sigaltstack to receive signals on a separate stack from the start. - * These two functions make sure the rest of the kernel won't be too - * upset by being on a different stack. The IRQ stack has a - * thread_info structure at the bottom so that current et al continue - * to work. - * - * to_irq_stack copies the current task's thread_info to the IRQ stack - * thread_info and sets the tasks's stack to point to the IRQ stack. - * - * from_irq_stack copies the thread_info struct back (flags may have - * been modified) and resets the task's stack pointer. - * - * Tricky bits - - * - * What happens when two signals race each other? UML doesn't block - * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal - * could arrive while a previous one is still setting up the - * thread_info. - * - * There are three cases - - * The first interrupt on the stack - sets up the thread_info and - * handles the interrupt - * A nested interrupt interrupting the copying of the thread_info - - * can't handle the interrupt, as the stack is in an unknown state - * A nested interrupt not interrupting the copying of the - * thread_info - doesn't do any setup, just handles the interrupt - * - * The first job is to figure out whether we interrupted stack setup. - * This is done by xchging the signal mask with thread_info->pending. - * If the value that comes back is zero, then there is no setup in - * progress, and the interrupt can be handled. If the value is - * non-zero, then there is stack setup in progress. In order to have - * the interrupt handled, we leave our signal in the mask, and it will - * be handled by the upper handler after it has set up the stack. - * - * Next is to figure out whether we are the outer handler or a nested - * one. As part of setting up the stack, thread_info->real_thread is - * set to non-NULL (and is reset to NULL on exit). This is the - * nesting indicator. 
If it is non-NULL, then the stack is already - * set up and the handler can run. - */ - -static unsigned long pending_mask; - -unsigned long to_irq_stack(unsigned long *mask_out) -{ - struct thread_info *ti; - unsigned long mask, old; - int nested; - - mask = xchg(&pending_mask, *mask_out); - if (mask != 0) { - /* - * If any interrupts come in at this point, we want to - * make sure that their bits aren't lost by our - * putting our bit in. So, this loop accumulates bits - * until xchg returns the same value that we put in. - * When that happens, there were no new interrupts, - * and pending_mask contains a bit for each interrupt - * that came in. - */ - old = *mask_out; - do { - old |= mask; - mask = xchg(&pending_mask, old); - } while (mask != old); - return 1; - } - - ti = current_thread_info(); - nested = (ti->real_thread != NULL); - if (!nested) { - struct task_struct *task; - struct thread_info *tti; - - task = cpu_tasks[ti->cpu].task; - tti = task_thread_info(task); - - *ti = *tti; - ti->real_thread = tti; - task->stack = ti; - } - - mask = xchg(&pending_mask, 0); - *mask_out |= mask | nested; - return 0; -} - -unsigned long from_irq_stack(int nested) -{ - struct thread_info *ti, *to; - unsigned long mask; - - ti = current_thread_info(); - - pending_mask = 1; - - to = ti->real_thread; - current->stack = to; - ti->real_thread = NULL; - *to = *ti; - - mask = xchg(&pending_mask, 0); - return mask & ~1; -} - diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c index 427dd5a61a38..419021175272 100644 --- a/arch/um/kernel/kmsg_dump.c +++ b/arch/um/kernel/kmsg_dump.c @@ -8,7 +8,7 @@ #include <os.h> static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + struct kmsg_dump_detail *detail) { static struct kmsg_dump_iter iter; static DEFINE_SPINLOCK(lock); @@ -57,7 +57,7 @@ static struct kmsg_dumper kmsg_dumper = { .dump = kmsg_dumper_stdout }; -int __init kmsg_dumper_stdout_init(void) +static int __init 
kmsg_dumper_stdout_init(void) { return kmsg_dump_register(&kmsg_dumper); } diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 3a85bde3e173..f2fb77da08cf 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(os_shutdown_socket); EXPORT_SYMBOL(os_create_unix_socket); EXPORT_SYMBOL(os_connect_socket); EXPORT_SYMBOL(os_accept_connection); -EXPORT_SYMBOL(os_rcv_fd); +EXPORT_SYMBOL(os_rcv_fd_msg); EXPORT_SYMBOL(run_helper); EXPORT_SYMBOL(os_major); EXPORT_SYMBOL(os_minor); diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c index 5cecd0e291fb..cb9d178ab7d8 100644 --- a/arch/um/kernel/load_file.c +++ b/arch/um/kernel/load_file.c @@ -48,9 +48,7 @@ void *uml_load_file(const char *filename, unsigned long long *size) return NULL; } - area = memblock_alloc(*size, SMP_CACHE_BYTES); - if (!area) - panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); if (__uml_load_file(filename, area, *size)) { memblock_free(area, *size); diff --git a/arch/um/kernel/maccess.c b/arch/um/kernel/maccess.c deleted file mode 100644 index 8ccd56813f68..000000000000 --- a/arch/um/kernel/maccess.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> - */ - -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <os.h> - -bool copy_from_kernel_nofault_allowed(const void *src, size_t size) -{ - void *psrc = (void *)rounddown((unsigned long)src, PAGE_SIZE); - - if ((unsigned long)src < PAGE_SIZE || size <= 0) - return false; - if (os_mincore(psrc, size + src - psrc) <= 0) - return false; - return true; -} diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 38d5a71a579b..76bec7de81b5 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -6,18 +6,20 @@ #include <linux/stddef.h> #include <linux/module.h> #include <linux/memblock.h> -#include 
<linux/highmem.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> -#include <asm/fixmap.h> +#include <linux/init.h> +#include <asm/sections.h> #include <asm/page.h> +#include <asm/pgalloc.h> #include <as-layout.h> #include <init.h> #include <kern.h> #include <kern_util.h> #include <mem_user.h> #include <os.h> +#include <um_malloc.h> #include <linux/sched/task.h> #ifdef CONFIG_KASAN @@ -49,14 +51,12 @@ EXPORT_SYMBOL(empty_zero_page); pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* Initialized at boot time, and readonly after that */ -unsigned long long highmem; -EXPORT_SYMBOL(highmem); int kmalloc_ok = 0; /* Used during early boot */ static unsigned long brk_end; -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -68,14 +68,16 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; - - /* this will put all low memory onto the freelists */ - memblock_free_all(); - max_low_pfn = totalram_pages(); + min_low_pfn = PFN_UP(__pa(uml_reserved)); max_pfn = max_low_pfn; +} + +void __init mem_init(void) +{ kmalloc_ok = 1; } +#if IS_ENABLED(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) /* * Create a page table and place a pointer to it in a middle page * directory entry. 
@@ -97,7 +99,7 @@ static void __init one_page_table_init(pmd_t *pmd) static void __init one_md_table_init(pud_t *pud) { -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 pmd_t *pmd_table = (pmd_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!pmd_table) panic("%s: Failed to allocate %lu bytes align=%lx\n", @@ -108,6 +110,19 @@ static void __init one_md_table_init(pud_t *pud) #endif } +static void __init one_ud_table_init(p4d_t *p4d) +{ +#if CONFIG_PGTABLE_LEVELS > 3 + pud_t *pud_table = (pud_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!pud_table) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + set_p4d(p4d, __p4d(_KERNPG_TABLE + (unsigned long) __pa(pud_table))); + BUG_ON(pud_table != pud_offset(p4d, 0)); +#endif +} + static void __init fixrange_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -125,6 +140,8 @@ static void __init fixrange_init(unsigned long start, unsigned long end, for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) + one_ud_table_init(p4d); pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) one_md_table_init(pud); @@ -139,7 +156,6 @@ static void __init fixrange_init(unsigned long start, unsigned long end, static void __init fixaddr_user_init( void) { -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA long size = FIXADDR_USER_END - FIXADDR_USER_START; pte_t *pte; phys_t p; @@ -161,13 +177,12 @@ static void __init fixaddr_user_init( void) pte = virt_to_kpte(vaddr); pte_set_val(*pte, p, PAGE_READONLY); } -#endif } +#endif void __init paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - unsigned long vaddr; empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -178,14 +193,9 @@ void __init paging_init(void) max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; free_area_init(max_zone_pfn); - /* - * Fixed mappings, only the page table structure has to be - * created - 
mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); - +#if IS_ENABLED(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) fixaddr_user_init(); +#endif } /* @@ -201,14 +211,13 @@ void free_initmem(void) pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } + return pgd; } @@ -236,3 +245,11 @@ static const pgprot_t protection_map[16] = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED }; DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 91485119ae67..af02b5f9911d 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -12,6 +12,7 @@ #include <as-layout.h> #include <init.h> #include <kern.h> +#include <kern_util.h> #include <mem_user.h> #include <os.h> @@ -21,23 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; EXPORT_SYMBOL(high_physmem); -extern unsigned long long physmem_size; - -void __init mem_total_pages(unsigned long physmem, unsigned long iomem, - unsigned long highmem) -{ - unsigned long phys_pages, highmem_pages; - unsigned long iomem_pages, total_pages; - - phys_pages = physmem >> PAGE_SHIFT; - iomem_pages = iomem >> PAGE_SHIFT; - highmem_pages = highmem >> PAGE_SHIFT; - - total_pages = phys_pages + iomem_pages + highmem_pages; - - max_mapnr = total_pages; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { @@ -63,13 +47,12 @@ void 
map_memory(unsigned long virt, unsigned long phys, unsigned long len, * @reserve_end: end address of the physical kernel memory. * @len: Length of total physical memory that should be mapped/made * available, in bytes. - * @highmem: Number of highmem bytes that should be mapped/made available. * - * Creates an unlinked temporary file of size (len + highmem) and memory maps + * Creates an unlinked temporary file of size (len) and memory maps * it on the last executable image address (uml_reserved). * * The offset is needed as the length of the total physical memory - * (len + highmem) includes the size of the memory used be the executable image, + * (len) includes the size of the memory used be the executable image, * but the mapped-to address is the last address of the executable image * (uml_reserved == end address of executable image). * @@ -77,24 +60,24 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, * of all user space processes/kernel tasks. */ void __init setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long long highmem) + unsigned long len) { unsigned long reserve = reserve_end - start; - long map_size = len - reserve; + unsigned long map_size = len - reserve; int err; - if(map_size <= 0) { + if (len <= reserve) { os_warn("Too few physical memory! 
Needed=%lu, given=%lu\n", reserve, len); exit(1); } - physmem_fd = create_mem_file(len + highmem); + physmem_fd = create_mem_file(len); err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); if (err < 0) { - os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " + os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p " "failed - errno = %d\n", map_size, (void *) reserve_end, err); exit(1); @@ -106,9 +89,8 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, */ os_seek_file(physmem_fd, __pa(__syscall_stub_start)); os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); - os_fsync_file(physmem_fd); - memblock_add(__pa(start), len + highmem); + memblock_add(__pa(start), len); memblock_reserve(__pa(start), reserve); min_low_pfn = PFN_UP(__pa(reserve_end)); @@ -136,10 +118,6 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out) region = region->next; } } - else if (phys < __pa(end_iomem) + highmem) { - fd = physmem_fd; - *offset_out = phys - iomem_size; - } return fd; } @@ -148,6 +126,8 @@ EXPORT_SYMBOL(phys_mapping); static int __init uml_mem_setup(char *line, int *add) { char *retptr; + + *add = 0; physmem_size = memparse(line,&retptr); return 0; } @@ -161,8 +141,6 @@ __uml_setup("mem=", uml_mem_setup, " Example: mem=64M\n\n" ); -extern int __init parse_iomem(char *str, int *add); - __uml_setup("iomem=", parse_iomem, "iomem=<name>,<file>\n" " Configure <file> as an IO memory region named <name>.\n\n" diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index ab95648e93e1..0cd6fad3d908 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -15,6 +15,7 @@ #include <linux/proc_fs.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/debug.h> @@ -26,6 +27,8 @@ #include <linux/resume_user_mode.h> #include <asm/current.h> #include <asm/mmu_context.h> +#include 
<asm/switch_to.h> +#include <asm/exec.h> #include <linux/uaccess.h> #include <as-layout.h> #include <kern_util.h> @@ -40,24 +43,8 @@ * cares about its entry, so it's OK if another processor is modifying its * entry. */ -struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } }; - -static inline int external_pid(void) -{ - /* FIXME: Need to look up userspace_pid by cpu */ - return userspace_pid[0]; -} - -int pid_to_processor_id(int pid) -{ - int i; - - for (i = 0; i < ncpus; i++) { - if (cpu_tasks[i].pid == pid) - return i; - } - return -1; -} +struct task_struct *cpu_tasks[NR_CPUS]; +EXPORT_SYMBOL(cpu_tasks); void free_stack(unsigned long stack, int order) { @@ -78,13 +65,10 @@ unsigned long alloc_stack(int order, int atomic) static inline void set_current(struct task_struct *task) { - cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) - { external_pid(), task }); + cpu_tasks[task_thread_info(task)->cpu] = task; } -extern void arch_switch_to(struct task_struct *to); - -void *__switch_to(struct task_struct *from, struct task_struct *to) +struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to) { to->thread.prev_sched = from; set_current(to); @@ -119,28 +103,26 @@ int get_current_pid(void) */ void new_thread_handler(void) { - int (*fn)(void *), n; + int (*fn)(void *); void *arg; if (current->thread.prev_sched != NULL) schedule_tail(current->thread.prev_sched); current->thread.prev_sched = NULL; - fn = current->thread.request.u.thread.proc; - arg = current->thread.request.u.thread.arg; + fn = current->thread.request.thread.proc; + arg = current->thread.request.thread.arg; /* * callback returns only if the kernel thread execs a process */ - n = fn(arg); - userspace(&current->thread.regs.regs, current_thread_info()->aux_fp_regs); + fn(arg); + userspace(&current->thread.regs.regs); } /* Called magically, see new_thread_handler above */ -void fork_handler(void) +static void fork_handler(void) { - force_flush_all(); - 
schedule_tail(current->thread.prev_sched); /* @@ -152,7 +134,7 @@ void fork_handler(void) current->thread.prev_sched = NULL; - userspace(&current->thread.regs.regs, current_thread_info()->aux_fp_regs); + userspace(&current->thread.regs.regs); } int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) @@ -177,8 +159,8 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) arch_copy_thread(&current->thread.arch, &p->thread.arch); } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); - p->thread.request.u.thread.proc = args->fn; - p->thread.request.u.thread.arg = args->fn_arg; + p->thread.request.thread.proc = args->fn; + p->thread.request.thread.arg = args->fn_arg; handler = new_thread_handler; } @@ -206,6 +188,21 @@ void initial_thread_cb(void (*proc)(void *), void *arg) kmalloc_ok = save_kmalloc_ok; } +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + + return 0; +} + void um_idle_sleep(void) { if (time_travel_mode != TT_MODE_OFF) @@ -216,7 +213,6 @@ void um_idle_sleep(void) void arch_cpu_idle(void) { - cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); um_idle_sleep(); } @@ -225,14 +221,6 @@ int __uml_cant_sleep(void) { /* Is in_interrupt() really needed? 
*/ } -int user_context(unsigned long sp) -{ - unsigned long stack; - - stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); - return stack != (unsigned long) current_thread_info(); -} - extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; void do_uml_exitcalls(void) @@ -250,88 +238,11 @@ char *uml_strdup(const char *string) } EXPORT_SYMBOL(uml_strdup); -int copy_to_user_proc(void __user *to, void *from, int size) -{ - return copy_to_user(to, from, size); -} - int copy_from_user_proc(void *to, void __user *from, int size) { return copy_from_user(to, from, size); } -int clear_user_proc(void __user *buf, int size) -{ - return clear_user(buf, size); -} - -static atomic_t using_sysemu = ATOMIC_INIT(0); -int sysemu_supported; - -void set_using_sysemu(int value) -{ - if (value > sysemu_supported) - return; - atomic_set(&using_sysemu, value); -} - -int get_using_sysemu(void) -{ - return atomic_read(&using_sysemu); -} - -static int sysemu_proc_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%d\n", get_using_sysemu()); - return 0; -} - -static int sysemu_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, sysemu_proc_show, NULL); -} - -static ssize_t sysemu_proc_write(struct file *file, const char __user *buf, - size_t count, loff_t *pos) -{ - char tmp[2]; - - if (copy_from_user(tmp, buf, 1)) - return -EFAULT; - - if (tmp[0] >= '0' && tmp[0] <= '2') - set_using_sysemu(tmp[0] - '0'); - /* We use the first char, but pretend to write everything */ - return count; -} - -static const struct proc_ops sysemu_proc_ops = { - .proc_open = sysemu_proc_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = single_release, - .proc_write = sysemu_proc_write, -}; - -int __init make_proc_sysemu(void) -{ - struct proc_dir_entry *ent; - if (!sysemu_supported) - return 0; - - ent = proc_create("sysemu", 0600, NULL, &sysemu_proc_ops); - - if (ent == NULL) - { - printk(KERN_WARNING "Failed to register /proc/sysemu\n"); - return 0; - 
} - - return 0; -} - -late_initcall(make_proc_sysemu); - int singlestepping(void) { return test_thread_flag(TIF_SINGLESTEP); @@ -384,11 +295,3 @@ unsigned long __get_wchan(struct task_struct *p) return 0; } - -int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) -{ - int cpu = current_thread_info()->cpu; - - return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu); -} - diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 6600a2782796..2124624b7817 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -35,9 +35,6 @@ void ptrace_disable(struct task_struct *child) user_disable_single_step(child); } -extern int peek_user(struct task_struct * child, long addr, long data); -extern int poke_user(struct task_struct * child, long addr, long data); - long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 48c0610d506e..680bce4bd8fa 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -9,6 +9,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/oom.h> +#include <linux/reboot.h> #include <kern_util.h> #include <os.h> #include <skas.h> @@ -28,7 +29,7 @@ static void kill_off_processes(void) t = find_lock_task_mm(p); if (!t) continue; - pid = t->mm->context.id.u.pid; + pid = t->mm->context.id.pid; task_unlock(t); os_kill_ptraced_process(pid, 1); } @@ -58,3 +59,18 @@ void machine_halt(void) { machine_power_off(); } + +static int sys_power_off_handler(struct sys_off_data *data) +{ + machine_power_off(); + return 0; +} + +static int register_power_off(void) +{ + register_sys_off_handler(SYS_OFF_MODE_POWER_OFF, + SYS_OFF_PRIO_DEFAULT, + sys_power_off_handler, NULL); + return 0; +} +__initcall(register_power_off); diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index 5085a50c3b8c..4fc04742048a 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c 
@@ -8,32 +8,6 @@ #include <os.h> #include <sigio.h> -/* Protected by sigio_lock() called from write_sigio_workaround */ -static int sigio_irq_fd = -1; - -static irqreturn_t sigio_interrupt(int irq, void *data) -{ - char c; - - os_read_file(sigio_irq_fd, &c, sizeof(c)); - return IRQ_HANDLED; -} - -int write_sigio_irq(int fd) -{ - int err; - - err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, - 0, "write sigio", NULL); - if (err < 0) { - printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " - "err = %d\n", err); - return -1; - } - sigio_irq_fd = fd; - return 0; -} - /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ static DEFINE_MUTEX(sigio_mutex); diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore new file mode 100644 index 000000000000..c3409ced0f38 --- /dev/null +++ b/arch/um/kernel/skas/.gitignore @@ -0,0 +1,2 @@ +stub_exe +stub_exe.dbg diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index f93972a25765..3384be42691f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -3,15 +3,48 @@ # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # -obj-y := clone.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ + stub_exe_embed.o -# clone.o is in the stub, so it can't be built with profiling +# Stub executable + +stub_exe_objs-y := stub_exe.o + +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) + +# Object file containing the ELF executable +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe + +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE + $(call if_changed,stub_exe) + +$(obj)/stub_exe: OBJCOPYFLAGS := -S +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE + $(call if_changed,objcopy) + +quiet_cmd_stub_exe = STUB_EXE $@ + cmd_stub_exe = $(CC) -nostdlib -o $@ \ + $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \ + $(filter %.o,$^) + 
+STUB_EXE_LDFLAGS = -Wl,-n -static + +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) + +# end + +# stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it -CFLAGS_clone.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := clone.o +CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) + +# Clang will call memset() from __builtin_alloca() when stack variable +# initialization is enabled, which is used in stub_exe.c. +CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) +UNPROFILE_OBJS := stub.o stub_exe.o KCOV_INSTRUMENT := n include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c deleted file mode 100644 index 62435187dda4..000000000000 --- a/arch/um/kernel/skas/clone.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <signal.h> -#include <sched.h> -#include <asm/unistd.h> -#include <sys/time.h> -#include <as-layout.h> -#include <ptrace_user.h> -#include <stub-data.h> -#include <sysdep/stub.h> - -/* - * This is in a separate file because it needs to be compiled with any - * extraneous gcc flags (-pg, -fprofile-arcs, -ftest-coverage) disabled - * - * Use UM_KERN_PAGE_SIZE instead of PAGE_SIZE because that calls getpagesize - * on some systems. 
- */ - -void __attribute__ ((__section__ (".__syscall_stub"))) -stub_clone_handler(void) -{ - struct stub_data *data = get_stub_data(); - long err; - - err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - (unsigned long)data + - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE / 2); - if (err) { - data->parent_err = err; - goto done; - } - - err = stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - if (err) { - data->child_err = err; - goto done; - } - - remap_stack_and_trap(); - - done: - trap_myself(); -} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 656fe16c9b63..0eb5a1d3ba70 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -10,14 +10,18 @@ #include <asm/pgalloc.h> #include <asm/sections.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <os.h> #include <skas.h> +#include <stub-data.h> + +/* Ensure the stub_data struct covers the allocated area */ +static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); int init_new_context(struct task_struct *task, struct mm_struct *mm) { - struct mm_context *from_mm = NULL; - struct mm_context *to_mm = &mm->context; + struct mm_id *new_id = &mm->context.id; unsigned long stack = 0; int ret = -ENOMEM; @@ -25,34 +29,24 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) if (stack == 0) goto out; - to_mm->id.stack = stack; - if (current->mm != NULL && current->mm != &init_mm) - from_mm = &current->mm->context; + new_id->stack = stack; block_signals_trace(); - if (from_mm) - to_mm->id.u.pid = copy_context_skas0(stack, - from_mm->id.u.pid); - else to_mm->id.u.pid = start_userspace(stack); + new_id->pid = start_userspace(stack); unblock_signals_trace(); - if (to_mm->id.u.pid < 0) { - ret = to_mm->id.u.pid; + if (new_id->pid < 0) { + ret = new_id->pid; goto out_free; } - ret = init_new_ldt(to_mm, from_mm); - if (ret < 0) { - printk(KERN_ERR "init_new_context_skas - init_ldt" - " failed, errno = %d\n", ret); - goto 
out_free; - } + /* Ensure the new MM is clean and nothing unwanted is mapped */ + unmap(new_id, 0, STUB_START); return 0; out_free: - if (to_mm->id.stack != 0) - free_pages(to_mm->id.stack, ilog2(STUB_DATA_PAGES)); + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } @@ -67,13 +61,12 @@ void destroy_context(struct mm_struct *mm) * whole UML suddenly dying. Also, cover negative and * 1 cases, since they shouldn't happen either. */ - if (mmu->id.u.pid < 2) { + if (mmu->id.pid < 2) { printk(KERN_ERR "corrupt mm_context - pid = %d\n", - mmu->id.u.pid); + mmu->id.pid); return; } - os_kill_ptraced_process(mmu->id.u.pid, 1); + os_kill_ptraced_process(mmu->id.pid, 1); free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); - free_ldt(mmu); } diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index f2ac134c9752..05dcdc057af9 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -8,22 +8,19 @@ #include <linux/sched/task_stack.h> #include <linux/sched/task.h> +#include <asm/tlbflush.h> + #include <as-layout.h> #include <kern.h> #include <os.h> #include <skas.h> +#include <kern_util.h> extern void start_kernel(void); static int __init start_kernel_proc(void *unused) { - int pid; - block_signals_trace(); - pid = os_getpid(); - - cpu_tasks[0].pid = pid; - cpu_tasks[0].task = current; start_kernel(); return 0; @@ -31,7 +28,7 @@ static int __init start_kernel_proc(void *unused) extern int userspace_pid[]; -extern char cpu0_irqstack[]; +static char cpu0_irqstack[THREAD_SIZE] __aligned(THREAD_SIZE); int __init start_uml(void) { @@ -40,8 +37,8 @@ int __init start_uml(void) init_new_thread_signals(); - init_task.thread.request.u.thread.proc = start_kernel_proc; - init_task.thread.request.u.thread.arg = NULL; + init_task.thread.request.thread.proc = start_kernel_proc; + init_task.thread.request.thread.arg = NULL; return start_idle_thread(task_stack_page(&init_task), &init_task.thread.switch_buf); } @@ -53,3 +50,19 @@ 
unsigned long current_stub_stack(void) return current->mm->context.id.stack; } + +struct mm_id *current_mm_id(void) +{ + if (current->mm == NULL) + return NULL; + + return &current->mm->context.id; +} + +void current_mm_sync(void) +{ + if (current->mm == NULL) + return; + + um_tlb_sync(current->mm); +} diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c new file mode 100644 index 000000000000..796fc266d3bb --- /dev/null +++ b/arch/um/kernel/skas/stub.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> + */ + +#include <sysdep/stub.h> + +static __always_inline int syscall_handler(struct stub_data *d) +{ + int i; + unsigned long res; + + for (i = 0; i < d->syscall_data_len; i++) { + struct stub_syscall *sc = &d->syscall_data[i]; + + switch (sc->syscall) { + case STUB_SYSCALL_MMAP: + res = stub_syscall6(STUB_MMAP_NR, + sc->mem.addr, sc->mem.length, + sc->mem.prot, + MAP_SHARED | MAP_FIXED, + sc->mem.fd, sc->mem.offset); + if (res != sc->mem.addr) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + case STUB_SYSCALL_MUNMAP: + res = stub_syscall2(__NR_munmap, + sc->mem.addr, sc->mem.length); + if (res) { + d->err = res; + d->syscall_data_len = i; + return -1; + } + break; + default: + d->err = -95; /* EOPNOTSUPP */ + d->syscall_data_len = i; + return -1; + } + } + + d->err = 0; + d->syscall_data_len = 0; + + return 0; +} + +void __section(".__syscall_stub") +stub_syscall_handler(void) +{ + struct stub_data *d = get_stub_data(); + + syscall_handler(d); + + trap_myself(); +} diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c new file mode 100644 index 000000000000..23c99b285e82 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe.c @@ -0,0 +1,95 @@ +#include <sys/ptrace.h> +#include <sys/prctl.h> +#include <asm/unistd.h> +#include <sysdep/stub.h> +#include <stub-data.h> + +void _start(void); + +noinline static void real_init(void) +{ + struct 
stub_init_data init_data; + unsigned long res; + struct { + void *ss_sp; + int ss_flags; + size_t ss_size; + } stack = { + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + }; + struct { + void *sa_handler_; + unsigned long sa_flags; + void *sa_restorer; + unsigned long long sa_mask; + } sa = { + /* Need to set SA_RESTORER (but the handler never returns) */ + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, + /* no need to mask any signals */ + .sa_mask = 0, + }; + + /* set a nice name */ + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + + /* Make sure this process dies if the kernel dies */ + stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + + /* read information from STDIN and close it */ + res = stub_syscall3(__NR_read, 0, + (unsigned long)&init_data, sizeof(init_data)); + if (res != sizeof(init_data)) + stub_syscall1(__NR_exit, 10); + + stub_syscall1(__NR_close, 0); + + /* map stub code + data */ + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, + init_data.stub_code_fd, init_data.stub_code_offset); + if (res != init_data.stub_start) + stub_syscall1(__NR_exit, 11); + + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start + UM_KERN_PAGE_SIZE, + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, + init_data.stub_data_fd, init_data.stub_data_offset); + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) + stub_syscall1(__NR_exit, 12); + + /* setup signal stack inside stub data */ + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); + + /* register SIGSEGV handler */ + sa.sa_handler_ = (void *) init_data.segv_handler; + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, + sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 13); + + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + + 
stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + + stub_syscall1(__NR_exit, 14); + + __builtin_unreachable(); +} + +__attribute__((naked)) void _start(void) +{ + /* + * Since the stack after exec() starts at the top-most address, + * but that's exactly where we also want to map the stub data + * and code, this must: + * - push the stack by 1 code and STUB_DATA_PAGES data pages + * - call real_init() + * This way, real_init() can use the stack normally, while the + * original stack further down (higher address) will become + * inaccessible after the mmap() calls above. + */ + stub_start(real_init); +} diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S new file mode 100644 index 000000000000..6d8914fbe8f1 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe_embed.S @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> + +__INITDATA + +SYM_DATA_START(stub_exe_start) + .incbin "arch/um/kernel/skas/stub_exe" +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) + +__FINIT diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 9ee19e566da3..a5beaea2967e 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -12,23 +12,13 @@ #include <sysdep/syscalls.h> #include <linux/time-internal.h> #include <asm/unistd.h> +#include <asm/delay.h> void handle_syscall(struct uml_pt_regs *r) { struct pt_regs *regs = container_of(r, struct pt_regs, regs); int syscall; - /* - * If we have infinite CPU resources, then make every syscall also a - * preemption point, since we don't have any other preemption in this - * case, and kernel threads would basically never run until userspace - * went to sleep, even if said userspace interacts with the kernel in - * various ways. 
- */ - if (time_travel_mode == TT_MODE_INFCPU || - time_travel_mode == TT_MODE_EXTERNAL) - schedule(); - /* Initialize the syscall number and default return value. */ UPT_SYSCALL_NR(r) = PT_SYSCALL_NR(r->gp); PT_REGS_SET_SYSCALL_RETURN(regs, -ENOSYS); @@ -41,9 +31,36 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall < __NR_syscalls) - PT_REGS_SET_SYSCALL_RETURN(regs, - EXECUTE_SYSCALL(syscall, regs)); + + /* + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. + */ + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + + if (syscall >= 0 && syscall < __NR_syscalls) { + unsigned long ret = EXECUTE_SYSCALL(syscall, regs); + + PT_REGS_SET_SYSCALL_RETURN(regs, ret); + + /* + * An error value here can be some form of -ERESTARTSYS + * and then we'd just loop. Make any error syscalls take + * some time, so that it won't just loop if something is + * not ready, and hopefully other things will make some + * progress. 
+ */ + if (IS_ERR_VALUE(ret) && + (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL)) { + um_udelay(1); + schedule(); + } + } out: syscall_trace_leave(regs); diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 746715379f12..13ee5666668d 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -11,7 +11,6 @@ #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> -#include <asm/sysrq.h> #include <asm/stacktrace.h> #include <os.h> @@ -33,12 +32,6 @@ void show_stack(struct task_struct *task, unsigned long *stack, struct pt_regs *segv_regs = current->thread.segv_regs; int i; - if (!segv_regs && os_is_signal_stack()) { - pr_err("Received SIGSEGV in SIGSEGV handler," - " aborting stack trace!\n"); - return; - } - if (!stack) stack = get_stack_pointer(task, segv_regs); @@ -53,5 +46,5 @@ void show_stack(struct task_struct *task, unsigned long *stack, } printk("%sCall Trace:\n", loglvl); - dump_trace(current, &stackops, (void *)loglvl); + dump_trace(task ?: current, &stackops, (void *)loglvl); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 3e270da6b6f6..1394568c0210 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -19,17 +19,21 @@ #include <asm/param.h> #include <kern_util.h> #include <os.h> +#include <linux/delay.h> #include <linux/time-internal.h> #include <linux/um_timetravel.h> #include <shared/init.h> #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#include <linux/sched/clock.h> + enum time_travel_mode time_travel_mode; EXPORT_SYMBOL_GPL(time_travel_mode); static bool time_travel_start_set; static unsigned long long time_travel_start; static unsigned long long time_travel_time; +static unsigned long long time_travel_shm_offset; static LIST_HEAD(time_travel_events); static LIST_HEAD(time_travel_irqs); static unsigned long long time_travel_timer_interval; @@ -39,8 +43,20 @@ static int time_travel_ext_fd = -1; static unsigned int time_travel_ext_waiting; static bool 
time_travel_ext_prev_request_valid; static unsigned long long time_travel_ext_prev_request; -static bool time_travel_ext_free_until_valid; -static unsigned long long time_travel_ext_free_until; +static unsigned long long *time_travel_ext_free_until; +static unsigned long long _time_travel_ext_free_until; +static u16 time_travel_shm_id; +static struct um_timetravel_schedshm *time_travel_shm; +static union um_timetravel_schedshm_client *time_travel_shm_client; + +unsigned long tt_extra_sched_jiffies; + +notrace unsigned long long sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES + + tt_extra_sched_jiffies) + * (NSEC_PER_SEC / HZ); +} static void time_travel_set_time(unsigned long long ns) { @@ -57,8 +73,52 @@ enum time_travel_message_handling { TTMH_IDLE, TTMH_POLL, TTMH_READ, + TTMH_READ_START_ACK, }; +static u64 bc_message; +int time_travel_should_print_bc_msg; + +void _time_travel_print_bc_msg(void) +{ + time_travel_should_print_bc_msg = 0; + printk(KERN_INFO "time-travel: received broadcast 0x%llx\n", bc_message); +} + +static void time_travel_setup_shm(int fd, u16 id) +{ + u32 len; + + time_travel_shm = os_mmap_rw_shared(fd, sizeof(*time_travel_shm)); + + if (!time_travel_shm) + goto out; + + len = time_travel_shm->len; + + if (time_travel_shm->version != UM_TIMETRAVEL_SCHEDSHM_VERSION || + len < struct_size(time_travel_shm, clients, id + 1)) { + os_unmap_memory(time_travel_shm, sizeof(*time_travel_shm)); + time_travel_shm = NULL; + goto out; + } + + time_travel_shm = os_mremap_rw_shared(time_travel_shm, + sizeof(*time_travel_shm), + len); + if (!time_travel_shm) + goto out; + + time_travel_shm_offset = time_travel_shm->current_time; + time_travel_shm_client = &time_travel_shm->clients[id]; + time_travel_shm_client->capa |= UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE; + time_travel_shm_id = id; + /* always look at that free_until from now on */ + time_travel_ext_free_until = &time_travel_shm->free_until; +out: + os_close_file(fd); +} + 
static void time_travel_handle_message(struct um_timetravel_msg *msg, enum time_travel_message_handling mode) { @@ -79,7 +139,20 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, } } - ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + if (unlikely(mode == TTMH_READ_START_ACK)) { + int fd[UM_TIMETRAVEL_SHARED_MAX_FDS]; + + ret = os_rcv_fd_msg(time_travel_ext_fd, fd, + ARRAY_SIZE(fd), msg, sizeof(*msg)); + if (ret == sizeof(*msg)) { + time_travel_setup_shm(fd[UM_TIMETRAVEL_SHARED_MEMFD], + msg->time & UM_TIMETRAVEL_START_ACK_ID); + /* we don't use the logging for now */ + os_close_file(fd[UM_TIMETRAVEL_SHARED_LOGFD]); + } + } else { + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + } if (ret == 0) panic("time-travel external link is broken\n"); @@ -95,10 +168,24 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, return; case UM_TIMETRAVEL_RUN: time_travel_set_time(msg->time); + if (time_travel_shm) { + /* no request right now since we're running */ + time_travel_shm_client->flags &= + ~UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + /* no ack for shared memory RUN */ + return; + } break; case UM_TIMETRAVEL_FREE_UNTIL: - time_travel_ext_free_until_valid = true; - time_travel_ext_free_until = msg->time; + /* not supposed to get this with shm, but ignore it */ + if (time_travel_shm) + break; + time_travel_ext_free_until = &_time_travel_ext_free_until; + _time_travel_ext_free_until = msg->time; + break; + case UM_TIMETRAVEL_BROADCAST: + bc_message = msg->time; + time_travel_should_print_bc_msg = 1; break; } @@ -135,8 +222,15 @@ static u64 time_travel_ext_req(u32 op, u64 time) block_signals_hard(); os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + /* no ACK expected for WAIT in shared memory mode */ + if (msg.op == UM_TIMETRAVEL_WAIT && time_travel_shm) + goto done; + while (msg.op != UM_TIMETRAVEL_ACK) - time_travel_handle_message(&msg, TTMH_READ); + time_travel_handle_message(&msg, + op == 
UM_TIMETRAVEL_START ? + TTMH_READ_START_ACK : + TTMH_READ); if (msg.seq != mseq) panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n", @@ -144,6 +238,7 @@ static u64 time_travel_ext_req(u32 op, u64 time) if (op == UM_TIMETRAVEL_GET) time_travel_set_time(msg.time); +done: unblock_signals_hard(); return msg.time; @@ -179,13 +274,33 @@ static void time_travel_ext_update_request(unsigned long long time) /* * if we're running and are allowed to run past the request * then we don't need to update it either + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. */ - if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && - time < time_travel_ext_free_until) + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) return; time_travel_ext_prev_request = time; time_travel_ext_prev_request_valid = true; + + if (time_travel_shm) { + union um_timetravel_schedshm_client *running; + + running = &time_travel_shm->clients[time_travel_shm->running_id]; + + if (running->capa & UM_TIMETRAVEL_SCHEDSHM_CAP_TIME_SHARE) { + time_travel_shm_client->flags |= + UM_TIMETRAVEL_SCHEDSHM_FLAGS_REQ_RUN; + time += time_travel_shm_offset; + time_travel_shm_client->req_time = time; + if (time < time_travel_shm->free_until) + time_travel_shm->free_until = time; + return; + } + } + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); } @@ -193,6 +308,14 @@ void __time_travel_propagate_time(void) { static unsigned long long last_propagated; + if (time_travel_shm) { + if (time_travel_shm->running_id != time_travel_shm_id) + panic("time-travel: setting time while not running\n"); + time_travel_shm->current_time = time_travel_time + + time_travel_shm_offset; + return; + } + if (last_propagated == time_travel_time) return; @@ -208,9 +331,12 @@ static bool time_travel_ext_request(unsigned long long time) * If we received an 
external sync point ("free until") then we * don't have to request/wait for anything until then, unless * we're already waiting. + * + * Note for shm we ignore FREE_UNTIL messages and leave the pointer + * to shared memory, and for non-shm the offset is 0. */ - if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && - time < time_travel_ext_free_until) + if (!time_travel_ext_waiting && time_travel_ext_free_until && + time < (*time_travel_ext_free_until - time_travel_shm_offset)) return false; time_travel_ext_update_request(time); @@ -224,7 +350,8 @@ static void time_travel_ext_wait(bool idle) }; time_travel_ext_prev_request_valid = false; - time_travel_ext_free_until_valid = false; + if (!time_travel_shm) + time_travel_ext_free_until = NULL; time_travel_ext_waiting++; time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); @@ -247,7 +374,11 @@ static void time_travel_ext_wait(bool idle) static void time_travel_ext_get_time(void) { - time_travel_ext_req(UM_TIMETRAVEL_GET, -1); + if (time_travel_shm) + time_travel_set_time(time_travel_shm->current_time - + time_travel_shm_offset); + else + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); } static void __time_travel_update_time(unsigned long long ns, bool idle) @@ -319,10 +450,15 @@ void time_travel_add_event_rel(struct time_travel_event *e, time_travel_add_event(e, time_travel_time + delay_ns); } -void time_travel_periodic_timer(struct time_travel_event *e) +static void time_travel_periodic_timer(struct time_travel_event *e) { time_travel_add_event(&time_travel_timer_event, time_travel_time + time_travel_timer_interval); + + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -474,6 +610,10 @@ EXPORT_SYMBOL_GPL(time_travel_add_irq_event); static void time_travel_oneshot_timer(struct time_travel_event *e) { + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) 
+ tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -719,7 +859,7 @@ static irqreturn_t um_timer(int irq, void *dev) if (get_current()->mm != NULL) { /* userspace - relay signal, results in correct userspace timers */ - os_alarm_process(get_current()->mm->context.id.u.pid); + os_alarm_process(get_current()->mm->context.id.pid); } (*timer_clockevent.event_handler)(&timer_clockevent); @@ -812,7 +952,7 @@ unsigned long calibrate_delay_is_known(void) return 0; } -int setup_time_travel(char *str) +static int setup_time_travel(char *str) { if (strcmp(str, "=inf-cpu") == 0) { time_travel_mode = TT_MODE_INFCPU; @@ -862,7 +1002,7 @@ __uml_help(setup_time_travel, "devices using it, assuming the device has the right capabilities.\n" "The optional ID is a 64-bit integer that's sent to the central scheduler.\n"); -int setup_time_travel_start(char *str) +static int setup_time_travel_start(char *str) { int err; @@ -874,9 +1014,49 @@ int setup_time_travel_start(char *str) return 1; } -__setup("time-travel-start", setup_time_travel_start); +__setup("time-travel-start=", setup_time_travel_start); __uml_help(setup_time_travel_start, -"time-travel-start=<seconds>\n" +"time-travel-start=<nanoseconds>\n" "Configure the UML instance's wall clock to start at this value rather than\n" "the host's wall clock at the time of UML boot.\n"); +static struct kobject *bc_time_kobject; + +static ssize_t bc_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%llx", bc_message); +} + +static ssize_t bc_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) +{ + int ret; + u64 user_bc_message; + + ret = kstrtou64(buf, 0, &user_bc_message); + if (ret) + return ret; + + bc_message = user_bc_message; + + time_travel_ext_req(UM_TIMETRAVEL_BROADCAST, bc_message); + pr_info("um: time: sent broadcast message: 0x%llx\n", bc_message); + return count; +} + +static struct kobj_attribute bc_attribute = __ATTR(bc-message, 0660, bc_show, 
bc_store); + +static int __init um_bc_start(void) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return 0; + + bc_time_kobject = kobject_create_and_add("um-ext-time", kernel_kobj); + if (!bc_time_kobject) + return 0; + + if (sysfs_create_file(bc_time_kobject, &bc_attribute.attr)) + pr_debug("failed to create the bc file in /sys/kernel/um_time"); + + return 0; +} +late_initcall(um_bc_start); #endif diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 7d050ab0f78a..cf7e0d4407f2 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -8,241 +8,82 @@ #include <linux/sched/signal.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <as-layout.h> #include <mem_user.h> #include <os.h> #include <skas.h> #include <kern_util.h> -struct host_vm_change { - struct host_vm_op { - enum { NONE, MMAP, MUNMAP, MPROTECT } type; - union { - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - int fd; - __u64 offset; - } mmap; - struct { - unsigned long addr; - unsigned long len; - } munmap; - struct { - unsigned long addr; - unsigned long len; - unsigned int prot; - } mprotect; - } u; - } ops[1]; - int userspace; - int index; - struct mm_struct *mm; - void *data; - int force; -}; - -#define INIT_HVC(mm, force, userspace) \ - ((struct host_vm_change) \ - { .ops = { { .type = NONE } }, \ - .mm = mm, \ - .data = NULL, \ - .userspace = userspace, \ - .index = 0, \ - .force = force }) - -static void report_enomem(void) -{ - printk(KERN_ERR "UML ran out of memory on the host side! 
" - "This can happen due to a memory limitation or " - "vm.max_map_count has been reached.\n"); -} - -static int do_ops(struct host_vm_change *hvc, int end, - int finished) -{ - struct host_vm_op *op; - int i, ret = 0; +struct vm_ops { + struct mm_id *mm_idp; - for (i = 0; i < end && !ret; i++) { - op = &hvc->ops[i]; - switch (op->type) { - case MMAP: - if (hvc->userspace) - ret = map(&hvc->mm->context.id, op->u.mmap.addr, - op->u.mmap.len, op->u.mmap.prot, - op->u.mmap.fd, - op->u.mmap.offset, finished, - &hvc->data); - else - map_memory(op->u.mmap.addr, op->u.mmap.offset, - op->u.mmap.len, 1, 1, 1); - break; - case MUNMAP: - if (hvc->userspace) - ret = unmap(&hvc->mm->context.id, - op->u.munmap.addr, - op->u.munmap.len, finished, - &hvc->data); - else - ret = os_unmap_memory( - (void *) op->u.munmap.addr, - op->u.munmap.len); - - break; - case MPROTECT: - if (hvc->userspace) - ret = protect(&hvc->mm->context.id, - op->u.mprotect.addr, - op->u.mprotect.len, - op->u.mprotect.prot, - finished, &hvc->data); - else - ret = os_protect_memory( - (void *) op->u.mprotect.addr, - op->u.mprotect.len, - 1, 1, 1); - break; - default: - printk(KERN_ERR "Unknown op type %d in do_ops\n", - op->type); - BUG(); - break; - } - } - - if (ret == -ENOMEM) - report_enomem(); - - return ret; -} + int (*mmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset); + int (*unmap)(struct mm_id *mm_idp, + unsigned long virt, unsigned long len); +}; -static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +static int kern_map(struct mm_id *mm_idp, + unsigned long virt, unsigned long len, int prot, + int phys_fd, unsigned long long offset) { - __u64 offset; - struct host_vm_op *last; - int fd = -1, ret = 0; - - if (hvc->userspace) - fd = phys_mapping(phys, &offset); - else - offset = phys; - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if 
((last->type == MMAP) && - (last->u.mmap.addr + last->u.mmap.len == virt) && - (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) && - (last->u.mmap.offset + last->u.mmap.len == offset)) { - last->u.mmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MMAP, - .u = { .mmap = { .addr = virt, - .len = len, - .prot = prot, - .fd = fd, - .offset = offset } - } }); - return ret; + /* TODO: Why is executable needed to be always set in the kernel? */ + return os_map_memory((void *)virt, phys_fd, offset, len, + prot & UM_PROT_READ, prot & UM_PROT_WRITE, + 1); } -static int add_munmap(unsigned long addr, unsigned long len, - struct host_vm_change *hvc) +static int kern_unmap(struct mm_id *mm_idp, + unsigned long virt, unsigned long len) { - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MUNMAP) && - (last->u.munmap.addr + last->u.mmap.len == addr)) { - last->u.munmap.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MUNMAP, - .u = { .munmap = { .addr = addr, - .len = len } } }); - return ret; + return os_unmap_memory((void *)virt, len); } -static int add_mprotect(unsigned long addr, unsigned long len, - unsigned int prot, struct host_vm_change *hvc) +void report_enomem(void) { - struct host_vm_op *last; - int ret = 0; - - if (hvc->index != 0) { - last = &hvc->ops[hvc->index - 1]; - if ((last->type == MPROTECT) && - (last->u.mprotect.addr + last->u.mprotect.len == addr) && - (last->u.mprotect.prot == prot)) { - last->u.mprotect.len += len; - return 0; - } - } - - if (hvc->index == ARRAY_SIZE(hvc->ops)) { - ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0); - hvc->index = 0; - } - - 
hvc->ops[hvc->index++] = ((struct host_vm_op) - { .type = MPROTECT, - .u = { .mprotect = { .addr = addr, - .len = len, - .prot = prot } } }); - return ret; + printk(KERN_ERR "UML ran out of memory on the host side! " + "This can happen due to a memory limitation or " + "vm.max_map_count has been reached.\n"); } -#define ADD_ROUND(n, inc) (((n) + (inc)) & ~((inc) - 1)) - static inline int update_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pte_t *pte; - int r, w, x, prot, ret = 0; + int ret = 0; pte = pte_offset_kernel(pmd, addr); do { - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) - w = 0; + if (!pte_needsync(*pte)) + continue; + + if (pte_present(*pte)) { + __u64 offset; + unsigned long phys = pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); + int r, w, x, prot; + + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) + w = 0; + + prot = (r ? UM_PROT_READ : 0) | + (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0); + + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, + prot, fd, offset); + } else + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? 
UM_PROT_EXEC : 0)); - if (hvc->force || pte_newpage(*pte)) { - if (pte_present(*pte)) { - if (pte_newpage(*pte)) - ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, prot, hvc); - } else - ret = add_munmap(addr, PAGE_SIZE, hvc); - } else if (pte_newprot(*pte)) - ret = add_mprotect(addr, PAGE_SIZE, prot, hvc); *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -250,7 +91,7 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, static inline int update_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pmd_t *pmd; unsigned long next; @@ -260,19 +101,20 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (hvc->force || pmd_newpage(*pmd)) { - ret = add_munmap(addr, next - addr, hvc); + if (pmd_needsync(*pmd)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pmd_mkuptodate(*pmd); } } - else ret = update_pte_range(pmd, addr, next, hvc); + else ret = update_pte_range(pmd, addr, next, ops); } while (pmd++, addr = next, ((addr < end) && !ret)); return ret; } static inline int update_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, - struct host_vm_change *hvc) + struct vm_ops *ops) { pud_t *pud; unsigned long next; @@ -282,19 +124,20 @@ static inline int update_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (hvc->force || pud_newpage(*pud)) { - ret = add_munmap(addr, next - addr, hvc); + if (pud_needsync(*pud)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); pud_mkuptodate(*pud); } } - else ret = update_pmd_range(pud, addr, next, hvc); + else ret = update_pmd_range(pud, addr, next, ops); } while (pud++, addr = next, ((addr < end) && !ret)); return ret; } static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, - struct host_vm_change 
*hvc) + struct vm_ops *ops) { p4d_t *p4d; unsigned long next; @@ -304,227 +147,57 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) { - if (hvc->force || p4d_newpage(*p4d)) { - ret = add_munmap(addr, next - addr, hvc); + if (p4d_needsync(*p4d)) { + ret = ops->unmap(ops->mm_idp, addr, + next - addr); p4d_mkuptodate(*p4d); } } else - ret = update_pud_range(p4d, addr, next, hvc); + ret = update_pud_range(p4d, addr, next, ops); } while (p4d++, addr = next, ((addr < end) && !ret)); return ret; } -static void fix_range_common(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) +int um_tlb_sync(struct mm_struct *mm) { pgd_t *pgd; - struct host_vm_change hvc; - unsigned long addr = start_addr, next; - int ret = 0, userspace = 1; + struct vm_ops ops; + unsigned long addr = mm->context.sync_tlb_range_from, next; + int ret = 0; + + if (mm->context.sync_tlb_range_to == 0) + return 0; + + ops.mm_idp = &mm->context.id; + if (mm == &init_mm) { + ops.mmap = kern_map; + ops.unmap = kern_unmap; + } else { + ops.mmap = map; + ops.unmap = unmap; + } - hvc = INIT_HVC(mm, force, userspace); pgd = pgd_offset(mm, addr); do { - next = pgd_addr_end(addr, end_addr); + next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); if (!pgd_present(*pgd)) { - if (force || pgd_newpage(*pgd)) { - ret = add_munmap(addr, next - addr, &hvc); + if (pgd_needsync(*pgd)) { + ret = ops.unmap(ops.mm_idp, addr, + next - addr); pgd_mkuptodate(*pgd); } } else - ret = update_p4d_range(pgd, addr, next, &hvc); - } while (pgd++, addr = next, ((addr < end_addr) && !ret)); - - if (!ret) - ret = do_ops(&hvc, hvc.index, 1); + ret = update_p4d_range(pgd, addr, next, &ops); + } while (pgd++, addr = next, + ((addr < mm->context.sync_tlb_range_to) && !ret)); - /* This is not an else because ret is modified above */ - if (ret) { - struct mm_id *mm_idp = &current->mm->context.id; - - printk(KERN_ERR "fix_range_common: 
failed, killing current " - "process: %d\n", task_tgid_vnr(current)); - mm_idp->kill = 1; - } -} - -static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end) -{ - struct mm_struct *mm; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long addr, last; - int updated = 0, err = 0, force = 0, userspace = 0; - struct host_vm_change hvc; - - mm = &init_mm; - hvc = INIT_HVC(mm, force, userspace); - for (addr = start; addr < end;) { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) { - last = ADD_ROUND(addr, PGDIR_SIZE); - if (last > end) - last = end; - if (pgd_newpage(*pgd)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) { - last = ADD_ROUND(addr, P4D_SIZE); - if (last > end) - last = end; - if (p4d_newpage(*p4d)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) { - last = ADD_ROUND(addr, PUD_SIZE); - if (last > end) - last = end; - if (pud_newpage(*pud)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - last = ADD_ROUND(addr, PMD_SIZE); - if (last > end) - last = end; - if (pmd_newpage(*pmd)) { - updated = 1; - err = add_munmap(addr, last - addr, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - } - addr = last; - continue; - } - - pte = pte_offset_kernel(pmd, addr); - if (!pte_present(*pte) || pte_newpage(*pte)) { - updated = 1; - err = add_munmap(addr, PAGE_SIZE, &hvc); - if (err < 0) - panic("munmap failed, errno = %d\n", - -err); - if (pte_present(*pte)) - err = add_mmap(addr, 
pte_val(*pte) & PAGE_MASK, - PAGE_SIZE, 0, &hvc); - } - else if (pte_newprot(*pte)) { - updated = 1; - err = add_mprotect(addr, PAGE_SIZE, 0, &hvc); - } - addr += PAGE_SIZE; - } - if (!err) - err = do_ops(&hvc, hvc.index, 1); - - if (err < 0) - panic("flush_tlb_kernel failed, errno = %d\n", err); - return updated; -} - -void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - struct mm_struct *mm = vma->vm_mm; - void *flush = NULL; - int r, w, x, prot, err = 0; - struct mm_id *mm_id; - - address &= PAGE_MASK; - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - goto kill; - - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) - goto kill; - - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - goto kill; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - goto kill; - - pte = pte_offset_kernel(pmd, address); - - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) { - w = 0; - } - - mm_id = &mm->context.id; - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? 
UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - unsigned long long offset; - int fd; - - fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset); - err = map(mm_id, address, PAGE_SIZE, prot, fd, offset, - 1, &flush); - } - else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush); - } - else if (pte_newprot(*pte)) - err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush); - - if (err) { - if (err == -ENOMEM) - report_enomem(); - - goto kill; - } - - *pte = pte_mkuptodate(*pte); + if (ret == -ENOMEM) + report_enomem(); - return; + mm->context.sync_tlb_range_from = 0; + mm->context.sync_tlb_range_to = 0; -kill: - printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address); - force_sig(SIGKILL); + return ret; } void flush_tlb_all(void) @@ -539,66 +212,11 @@ void flush_tlb_all(void) flush_tlb_mm(current->mm); } -void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_kernel_range_common(start, end); -} - -void flush_tlb_kernel_vm(void) -{ - flush_tlb_kernel_range_common(start_vm, end_vm); -} - -void __flush_tlb_one(unsigned long addr) -{ - flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE); -} - -static void fix_range(struct mm_struct *mm, unsigned long start_addr, - unsigned long end_addr, int force) -{ - /* - * Don't bother flushing if this address space is about to be - * destroyed. 
- */ - if (atomic_read(&mm->mm_users) == 0) - return; - - fix_range_common(mm, start_addr, end_addr, force); -} - -void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_mm == NULL) - flush_tlb_kernel_range_common(start, end); - else fix_range(vma->vm_mm, start, end, 0); -} -EXPORT_SYMBOL(flush_tlb_range); - -void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end) -{ - fix_range(mm, start, end, 0); -} - void flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end, 0); -} - -void force_flush_all(void) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, mm, 0); - - mmap_read_lock(mm); - for_each_vma(vmi, vma) - fix_range(mm, vma->vm_start, vma->vm_end, 1); - mmap_read_unlock(mm); + um_tlb_mark_sync(mm, vma->vm_start, vma->vm_end); } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 6d8ae86ae978..ce073150dc20 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -16,6 +16,7 @@ #include <kern_util.h> #include <os.h> #include <skas.h> +#include <arch.h> /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by @@ -113,7 +114,7 @@ good_area: #if 0 WARN_ON(!pte_young(*pte) || (is_write && !pte_dirty(*pte))); #endif - flush_tlb_page(vma, address); + out: mmap_read_unlock(mm); out_nosemaphore: @@ -175,12 +176,14 @@ void fatal_sigsegv(void) * @sig: the signal number * @unused_si: the signal info struct; unused in this handler * @regs: the ptrace register information + * @mc: the mcontext of the signal * * The handler first extracts the faultinfo from the UML ptrace regs struct. * If the userfault did not happen in an UML userspace process, bad_segv is called. * Otherwise the signal did happen in a cloned userspace process, handle it. 
*/ -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -189,7 +192,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -199,9 +202,8 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { - jmp_buf *catcher; int si_code; int err; int is_write = FAULT_WRITE(fi); @@ -210,11 +212,33 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, if (!is_user && regs) current->thread.segv_regs = container_of(regs, struct pt_regs, regs); - if (!is_user && (address >= start_vm) && (address < end_vm)) { - flush_tlb_kernel_vm(); + if (!is_user && init_mm.context.sync_tlb_range_to) { + /* + * Kernel has pending updates from set_ptes that were not + * flushed yet. Syncing them should fix the pagefault (if not + * we'll get here again and panic). 
+ */ + err = um_tlb_sync(&init_mm); + if (err == -ENOMEM) + report_enomem(); + if (err) + panic("Failed to sync kernel TLBs: %d", err); goto out; } else if (current->mm == NULL) { + if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } @@ -237,15 +261,8 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, address = 0; } - catcher = current->thread.fault_catcher; if (!err) goto out; - else if (catcher != NULL) { - current->thread.fault_addr = (void *) address; - UML_LONGJMP(catcher, 1); - } - else if (current->thread.fault_addr != NULL) - panic("fault_addr set but no fault catcher"); else if (!is_user && arch_fixup(ip, regs)) goto out; @@ -273,7 +290,8 @@ out: return 0; } -void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { int code, err; if (!UPT_IS_USER(regs)) { @@ -301,15 +319,8 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) } } -void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) -{ - if (current->thread.fault_catcher != NULL) - UML_LONGJMP(current->thread.fault_catcher, 1); - else - relay_signal(sig, si, regs); -} - -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 7a9820797eae..d4b3b6742ec8 100644 --- a/arch/um/kernel/um_arch.c +++ 
b/arch/um/kernel/um_arch.c @@ -12,6 +12,7 @@ #include <linux/panic_notifier.h> #include <linux/seq_file.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/utsname.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -23,6 +24,7 @@ #include <asm/cpufeature.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/text-patching.h> #include <as-layout.h> #include <arch.h> #include <init.h> @@ -64,9 +66,6 @@ struct cpuinfo_um boot_cpu_data = { EXPORT_SYMBOL(boot_cpu_data); -union thread_union cpu0_irqstack - __section(".data..init_irqstack") = - { .thread_info = INIT_THREAD_INFO(init_task) }; /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; @@ -80,7 +79,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? 
"yes" : "no"); + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); seq_printf(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) @@ -125,15 +124,12 @@ unsigned long uml_reserved; /* Also modified in mem_init */ unsigned long start_vm; unsigned long end_vm; -/* Set in uml_ncpus_setup */ -int ncpus = 1; - /* Set in early boot */ static int have_root __initdata; static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 64 * 1024 * 1024; +unsigned long long physmem_size = 64 * 1024 * 1024; EXPORT_SYMBOL(physmem_size); static const char *usage_string = @@ -169,19 +165,6 @@ __uml_setup("root=", uml_root_setup, " root=/dev/ubd5\n\n" ); -static int __init no_skas_debug_setup(char *line, int *add) -{ - os_warn("'debug' is not necessary to gdb UML in skas mode - run\n"); - os_warn("'gdb linux'\n"); - - return 0; -} - -__uml_setup("debug", no_skas_debug_setup, -"debug\n" -" this flag is not needed to run gdb on UML in skas mode\n\n" -); - static int __init uml_console_setup(char *line, int *add) { have_console = 1; @@ -259,6 +242,8 @@ static struct notifier_block panic_exit_notifier = { void uml_finishsetup(void) { + cpu_tasks[0] = &init_task; + atomic_notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); @@ -280,7 +265,7 @@ EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) -static void parse_host_cpu_flags(char *line) +static void __init parse_host_cpu_flags(char *line) { int i; for (i = 0; i < 32*NCAPINTS; i++) { @@ -288,7 +273,8 @@ static void parse_host_cpu_flags(char *line) set_cpu_cap(&boot_cpu_data, i); } } -static void parse_cache_line(char *line) + +static void __init parse_cache_line(char *line) { long res; char *to_parse = strstr(line, ":"); @@ -304,7 +290,24 @@ static void parse_cache_line(char *line) } } -int __init linux_main(int argc, char **argv) +static unsigned long __init 
get_top_address(char **envp) +{ + unsigned long top_addr = (unsigned long) &top_addr; + int i; + + /* The earliest variable should be after the program name in ELF */ + for (i = 0; envp[i]; i++) { + if ((unsigned long) envp[i] > top_addr) + top_addr = (unsigned long) envp[i]; + } + + top_addr &= ~(UM_KERN_PAGE_SIZE - 1); + top_addr += UM_KERN_PAGE_SIZE; + + return top_addr; +} + +int __init linux_main(int argc, char **argv, char **envp) { unsigned long avail, diff; unsigned long virtmem_size, max_physmem; @@ -326,20 +329,23 @@ int __init linux_main(int argc, char **argv) if (have_console == 0) add_arg(DEFAULT_COMMAND_LINE_CONSOLE); - host_task_size = os_get_top_address(); - /* reserve a few pages for the stubs (taking care of data alignment) */ - /* align the data portion */ - BUILD_BUG_ON(!is_power_of_2(STUB_DATA_PAGES)); - stub_start = (host_task_size - 1) & ~(STUB_DATA_PAGES * PAGE_SIZE - 1); + host_task_size = get_top_address(envp); + /* reserve a few pages for the stubs */ + stub_start = host_task_size - STUB_DATA_PAGES * PAGE_SIZE; /* another page for the code portion */ stub_start -= PAGE_SIZE; host_task_size = stub_start; + /* Limit TASK_SIZE to what is addressable by the page table */ + task_size = host_task_size; + if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE) + task_size = PTRS_PER_PGD * PGDIR_SIZE; + /* * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps * out */ - task_size = host_task_size & PGDIR_MASK; + task_size = task_size & PGDIR_MASK; /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); @@ -368,23 +374,18 @@ int __init linux_main(int argc, char **argv) setup_machinename(init_utsname()->machine); - highmem = 0; + physmem_size = (physmem_size + PAGE_SIZE - 1) & PAGE_MASK; iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; - max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; - /* - * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary, - * so this makes sure 
that's true for highmem - */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1); - if (physmem_size + iomem_size > max_physmem) { - highmem = physmem_size + iomem_size - max_physmem; - physmem_size -= highmem; + max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; + if (physmem_size > max_physmem) { + physmem_size = max_physmem; + os_info("Physical memory size shrunk to %llu bytes\n", + physmem_size); } high_physmem = uml_physmem + physmem_size; end_iomem = high_physmem + iomem_size; - high_memory = (void *) end_iomem; start_vm = VMALLOC_START; @@ -400,6 +401,8 @@ int __init linux_main(int argc, char **argv) os_info("Kernel virtual memory size shrunk to %lu bytes\n", virtmem_size); + arch_task_struct_size = sizeof(struct task_struct) + host_fp_size; + os_flush_stdout(); return start_uml(); @@ -414,9 +417,8 @@ void __init setup_arch(char **cmdline_p) { u8 rng_seed[32]; - stack_protections((unsigned long) &init_thread_info); - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - mem_total_pages(physmem_size, iomem_size, highmem); + stack_protections((unsigned long) init_task.stack); + setup_physmem(uml_physmem, uml_reserved, physmem_size); uml_dtb_init(); read_initrd(); @@ -470,6 +472,11 @@ void *text_poke(void *addr, const void *opcode, size_t len) return memcpy(addr, opcode, len); } +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + void text_poke_sync(void) { } diff --git a/arch/um/kernel/um_arch.h b/arch/um/kernel/um_arch.h index 1e07fb7ee35e..46e731ab9dfc 100644 --- a/arch/um/kernel/um_arch.h +++ b/arch/um/kernel/um_arch.h @@ -11,4 +11,6 @@ extern void __init uml_dtb_init(void); static inline void uml_dtb_init(void) { } #endif +extern int __init read_initrd(void); + #endif diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 5c92d58a78e8..a409d4b66114 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -77,8 +77,6 @@ SECTIONS 
.data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.gnu.linkonce.d*) CONSTRUCTORS |