From 1bd21c6c21e848996339508d3ffb106d505256a8 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:01 +0200 Subject: syscalls/core: Introduce CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y It may be useful for an architecture to override the definitions of the SYSCALL_DEFINE0() and __SYSCALL_DEFINEx() macros in <linux/syscalls.h>, in particular to use a different calling convention for syscalls. This patch provides a mechanism to do so: It introduces CONFIG_ARCH_HAS_SYSCALL_WRAPPER. If it is enabled, <asm/syscall_wrapper.h> is included in <linux/syscalls.h> and may be used to define the macros mentioned above. Moreover, as the syscall calling convention may be different if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is set, the syscall function prototypes in <linux/syscalls.h> are #ifndef'd out in that case. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-3-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- include/linux/syscalls.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b961184f597a..503ab245d4ce 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -81,6 +81,17 @@ union bpf_attr; #include #include +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* + * It may be useful for an architecture to override the definitions of the + * SYSCALL_DEFINE0() and __SYSCALL_DEFINEx() macros, in particular to use a + * different calling convention for syscalls. To allow for that, the prototypes + * for the sys_*() functions below will *not* be included if + * CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. 
+ */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* * __MAP - apply a macro to syscall arguments * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to @@ -189,11 +200,13 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) } #endif +#ifndef SYSCALL_DEFINE0 #define SYSCALL_DEFINE0(sname) \ SYSCALL_METADATA(_##sname, 0); \ asmlinkage long sys_##sname(void); \ ALLOW_ERROR_INJECTION(sys_##sname, ERRNO); \ asmlinkage long sys_##sname(void) +#endif /* SYSCALL_DEFINE0 */ #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) @@ -209,6 +222,8 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) + +#ifndef __SYSCALL_DEFINEx #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(SyS##name)))); \ @@ -223,6 +238,7 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) return ret; \ } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#endif /* __SYSCALL_DEFINEx */ /* * Called before coming back to user-mode. Returning to user-mode with an @@ -252,7 +268,12 @@ static inline void addr_limit_user_check(void) * Please note that these prototypes here are only provided for information * purposes, for static analysis, and for linking from the syscall table. * These functions should not be called elsewhere from kernel code. + * + * As the syscall calling convention may be different from the default + * for architectures overriding the syscall calling convention, do not + * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. 
*/ +#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx); asmlinkage long sys_io_destroy(aio_context_t ctx); asmlinkage long sys_io_submit(aio_context_t, long, @@ -1076,6 +1097,8 @@ asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg); */ asmlinkage long sys_ni_syscall(void); +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* * Kernel code should not call syscalls (i.e., sys_xyzyyz()) directly. -- cgit v1.2.3 From fa697140f9a20119a9ec8fd7460cc4314fbdaff3 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:02 +0200 Subject: syscalls/x86: Use 'struct pt_regs' based syscall calling convention for 64-bit syscalls Let's make use of ARCH_HAS_SYSCALL_WRAPPER=y on pure 64-bit x86-64 systems: Each syscall defines a stub which takes struct pt_regs as its only argument. It decodes just those parameters it needs, e.g: asmlinkage long sys_xyzzy(const struct pt_regs *regs) { return SyS_xyzzy(regs->di, regs->si, regs->dx); } This approach avoids leaking random user-provided register content down the call chain. For example, for sys_recv() which is a 4-parameter syscall, the assembly now is (in slightly reordered fashion): <sys_recv>: callq <__fentry__> /* decode regs->di, ->si, ->dx and ->r10 */ mov 0x70(%rdi),%rdi mov 0x68(%rdi),%rsi mov 0x60(%rdi),%rdx mov 0x38(%rdi),%rcx [ SyS_recv() is automatically inlined by the compiler, as it is not [yet] used anywhere else ] /* clear %r9 and %r8, the 5th and 6th args */ xor %r9d,%r9d xor %r8d,%r8d /* do the actual work */ callq __sys_recvfrom /* cleanup and return */ cltq retq The only valid place in an x86-64 kernel which rightfully calls a syscall function on its own -- vsyscall -- needs to be modified to pass struct pt_regs onwards as well. To keep the syscall table generation working independent of SYSCALL_PTREGS being enabled, the stubs are named the same as the "original" syscall stubs, i.e. sys_*(). 
This patch is based on an original proof-of-concept | From: Linus Torvalds | Signed-off-by: Linus Torvalds and was split up and heavily modified by me, in particular to base it on ARCH_HAS_SYSCALL_WRAPPER, to limit it to 64-bit-only for the time being, and to update the vsyscall to the new calling convention. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-4-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 503ab245d4ce..d7168b3a4b4c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -102,7 +102,7 @@ union bpf_attr; * for SYSCALL_DEFINE/COMPAT_SYSCALL_DEFINE */ #define __MAP0(m,...) -#define __MAP1(m,t,a) m(t,a) +#define __MAP1(m,t,a,...) m(t,a) #define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__) #define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__) #define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__) -- cgit v1.2.3 From 7303e30ec1d8fb5ca1f07c92d069241c32b2ee1b Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Thu, 5 Apr 2018 11:53:03 +0200 Subject: syscalls/core: Prepare CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y for compat syscalls It may be useful for an architecture to override the definitions of the COMPAT_SYSCALL_DEFINE0() and __COMPAT_SYSCALL_DEFINEx() macros in <linux/compat.h>, in particular to use a different calling convention for syscalls. This patch provides a mechanism to do so, based on the previously introduced CONFIG_ARCH_HAS_SYSCALL_WRAPPER. If it is enabled, <asm/syscall_wrapper.h> is included in <linux/compat.h> and may be used to define the macros mentioned above. 
Moreover, as the syscall calling convention may be different if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is set, the compat syscall function prototypes in <linux/compat.h> are #ifndef'd out in that case. As some of the syscalls and/or compat syscalls may not be present, the COND_SYSCALL() and COND_SYSCALL_COMPAT() macros in kernel/sys_ni.c as well as the SYS_NI() and COMPAT_SYS_NI() macros in kernel/time/posix-stubs.c can be re-defined in <asm/syscall_wrapper.h> iff CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180405095307.3730-5-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- include/linux/compat.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 9847c5a013c3..2d85ec5cfda2 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -24,6 +24,17 @@ #include #include +#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER +/* + * It may be useful for an architecture to override the definitions of the + * COMPAT_SYSCALL_DEFINE0 and COMPAT_SYSCALL_DEFINEx() macros, in particular + * to use a different calling convention for syscalls. To allow for that, + * the prototypes for the compat_sys_*() functions below will *not* be included + * if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. 
+ */ +#include <asm/syscall_wrapper.h> +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + #ifndef COMPAT_USE_64BIT_TIME #define COMPAT_USE_64BIT_TIME 0 #endif @@ -32,10 +43,12 @@ #define __SC_DELOUSE(t,v) ((__force t)(unsigned long)(v)) #endif +#ifndef COMPAT_SYSCALL_DEFINE0 #define COMPAT_SYSCALL_DEFINE0(name) \ asmlinkage long compat_sys_##name(void); \ ALLOW_ERROR_INJECTION(compat_sys_##name, ERRNO); \ asmlinkage long compat_sys_##name(void) +#endif /* COMPAT_SYSCALL_DEFINE0 */ #define COMPAT_SYSCALL_DEFINE1(name, ...) \ COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) @@ -50,6 +63,7 @@ #define COMPAT_SYSCALL_DEFINE6(name, ...) \ COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) +#ifndef COMPAT_SYSCALL_DEFINEx #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))\ @@ -62,6 +76,7 @@ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ } \ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#endif /* COMPAT_SYSCALL_DEFINEx */ #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() @@ -517,7 +532,12 @@ int __compat_save_altstack(compat_stack_t __user *, unsigned long); * Please note that these prototypes here are only provided for information * purposes, for static analysis, and for linking from the syscall table. * These functions should not be called elsewhere from kernel code. + * + * As the syscall calling convention may be different from the default + * for architectures overriding the syscall calling convention, do not + * include the prototypes if CONFIG_ARCH_HAS_SYSCALL_WRAPPER is enabled. 
*/ +#ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER asmlinkage long compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p); asmlinkage long compat_sys_io_submit(compat_aio_context_t ctx_id, int nr, u32 __user *iocb); @@ -955,6 +975,8 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr); /* obsolete: net/socket.c */ asmlinkage long compat_sys_socketcall(int call, u32 __user *args); +#endif /* CONFIG_ARCH_HAS_SYSCALL_WRAPPER */ + /* * For most but not all architectures, "am I in a compat syscall?" and -- cgit v1.2.3 From e145242ea0df6b7d28fd7186e61d6840fa4bb06e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 9 Apr 2018 12:51:42 +0200 Subject: syscalls/core, syscalls/x86: Clean up syscall stub naming convention Tidy the naming convention for compat syscall stubs. Hints which describe the purpose of the stub go in front and receive a double underscore to denote that they are generated on-the-fly by the SYSCALL_DEFINEx() macro. For the generic case, this means (0xffffffff prefix removed): 810f08d0 t kernel_waitid # common C function (see kernel/exit.c) __do_sys_waitid # inlined helper doing the actual work # (takes original parameters as declared) 810f1aa0 T __se_sys_waitid # sign-extending C function calling inlined # helper (takes parameters of type long; # casts them to the declared type) 810f1aa0 T sys_waitid # alias to __se_sys_waitid() (taking # parameters as declared), to be included # in syscall table For x86, the naming is as follows: 810efc70 t kernel_waitid # common C function (see kernel/exit.c) __do_sys_waitid # inlined helper doing the actual work # (takes original parameters as declared) 810efd60 t __se_sys_waitid # sign-extending C function calling inlined # helper (takes parameters of type long; # casts them to the declared type) 810f1140 T __ia32_sys_waitid # IA32_EMULATION 32-bit-ptregs -> C stub, # calls __se_sys_waitid(); to be included # in syscall table 810f1110 T sys_waitid # x86 64-bit-ptregs -> C stub, calls # 
__se_sys_waitid(); to be included in # syscall table For x86, sys_waitid() will be re-named to __x64_sys_waitid in a follow-up patch. Suggested-by: Ingo Molnar Signed-off-by: Dominik Brodowski Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180409105145.5364-2-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- include/linux/syscalls.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d7168b3a4b4c..70fcda1a9049 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -223,21 +223,26 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) +/* + * The asmlinkage stub is aliased to a function named __se_sys_*() which + * sign-extends 32-bit ints to longs whenever needed. The actual work is + * done within __do_sys_*(). + */ #ifndef __SYSCALL_DEFINEx #define __SYSCALL_DEFINEx(x, name, ...) 
\ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(SyS##name)))); \ + __attribute__((alias(__stringify(__se_sys##name)))); \ ALLOW_ERROR_INJECTION(sys##name, ERRNO); \ - static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ - long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ __MAP(x,__SC_TEST,__VA_ARGS__); \ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ return ret; \ } \ - static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* __SYSCALL_DEFINEx */ /* -- cgit v1.2.3 From 5ac9efa3c50d7caff9f3933bb8a3ad1139d92d92 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Mon, 9 Apr 2018 12:51:43 +0200 Subject: syscalls/core, syscalls/x86: Clean up compat syscall stub naming convention Tidy the naming convention for compat syscall stubs. Hints which describe the purpose of the stub go in front and receive a double underscore to denote that they are generated on-the-fly by the COMPAT_SYSCALL_DEFINEx() macro. 
For the generic case, this means: t kernel_waitid # common C function (see kernel/exit.c) __do_compat_sys_waitid # inlined helper doing the actual work # (takes original parameters as declared) T __se_compat_sys_waitid # sign-extending C function calling inlined # helper (takes parameters of type long, # casts them to unsigned long and then to # the declared type) T compat_sys_waitid # alias to __se_compat_sys_waitid() # (taking parameters as declared), to # be included in syscall table For x86, the naming is as follows: t kernel_waitid # common C function (see kernel/exit.c) __do_compat_sys_waitid # inlined helper doing the actual work # (takes original parameters as declared) t __se_compat_sys_waitid # sign-extending C function calling inlined # helper (takes parameters of type long, # casts them to unsigned long and then to # the declared type) T __ia32_compat_sys_waitid # IA32_EMULATION 32-bit-ptregs -> C stub, # calls __se_compat_sys_waitid(); to be # included in syscall table T __x32_compat_sys_waitid # x32 64-bit-ptregs -> C stub, calls # __se_compat_sys_waitid(); to be included # in syscall table If only one of IA32_EMULATION and x32 is enabled, __se_compat_sys_waitid() may be inlined into the stub __{ia32,x32}_compat_sys_waitid(). Suggested-by: Ingo Molnar Signed-off-by: Dominik Brodowski Cc: Al Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180409105145.5364-3-linux@dominikbrodowski.net Signed-off-by: Ingo Molnar --- include/linux/compat.h | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/compat.h b/include/linux/compat.h index 2d85ec5cfda2..aca050aac7b6 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -63,19 +63,24 @@ #define COMPAT_SYSCALL_DEFINE6(name, ...) 
\ COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) +/* + * The asmlinkage stub is aliased to a function named __se_compat_sys_*() which + * sign-extends 32-bit ints to longs whenever needed. The actual work is + * done within __do_compat_sys_*(). + */ #ifndef COMPAT_SYSCALL_DEFINEx -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ - asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))\ - __attribute__((alias(__stringify(compat_SyS##name)))); \ - ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ - static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ - asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ - asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ - { \ - return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ - } \ - static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ + __attribute__((alias(__stringify(__se_compat_sys##name)))); \ + ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ + } \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* COMPAT_SYSCALL_DEFINEx */ #ifndef compat_user_stack_pointer -- cgit v1.2.3