From 976516404ff3fab2a8caa8bd6f5efc1437fed0b8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 14:02:57 +0200 Subject: y2038: remove unused time interfaces After many small patches, at least some of the deprecated interfaces have no remaining users any more and can be removed: current_kernel_time do_settimeofday get_monotonic_boottime get_monotonic_boottime64 get_monotonic_coarse get_monotonic_coarse64 getrawmonotonic64 ktime_get_real_ts timekeeping_clocktai timespec_trunc timespec_valid_strict time_to_tm For many of the remaining time functions, we are missing one or two patches that failed to make it into 4.19, they will be removed in the following merge window. The replacement functions for the removed interfaces are documented in Documentation/core-api/timekeeping.rst. Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index ccdb351277ee..712543011106 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -342,30 +342,6 @@ unsigned int jiffies_to_usecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_usecs); -/** - * timespec_trunc - Truncate timespec to a granularity - * @t: Timespec - * @gran: Granularity in ns. - * - * Truncate a timespec to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec timespec_trunc(struct timespec t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec_trunc); - /* * mktime64 - Converts date to seconds. * Converts Gregorian date to seconds since 1970-01-01 00:00:00. -- cgit v1.2.3 From 33e26418193f58d1895f2f968e1953b1caf8deb7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Aug 2018 15:18:20 +0200 Subject: y2038: make do_gettimeofday() and get_seconds() inline get_seconds() and do_gettimeofday() are only used by a few modules now any more (waiting for the respective patches to get accepted), and they are among the last holdouts of code that is not y2038 safe in the core kernel. Move the implementation into the timekeeping32.h header to clean up the core kernel and isolate the old interfaces further. Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 15 +++++++++------ kernel/time/timekeeping.c | 24 ------------------------ 2 files changed, 9 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 712543011106..de332250d6fa 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -144,9 +144,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (unlikely(tz != NULL)) { @@ -227,10 +229,11 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, struct timezone __user *, tz) { if (tv) { - struct timeval ktv; + struct timespec64 ts; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) + ktime_get_real_ts64(&ts); + if (put_user(ts.tv_sec, &tv->tv_sec) || + put_user(ts.tv_nsec / 1000, &tv->tv_usec)) return -EFAULT; } if (tz) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f3b22f456fac..2d110c948805 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1211,22 +1211,6 @@ int get_device_system_crosststamp(int (*get_time_fn) } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using getnstimeofday() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec64 now; - - getnstimeofday64(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} -EXPORT_SYMBOL(do_gettimeofday); - /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time @@ -2174,14 +2158,6 @@ void getboottime64(struct timespec64 *ts) } EXPORT_SYMBOL_GPL(getboottime64); -unsigned long get_seconds(void) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - return tk->xtime_sec; -} -EXPORT_SYMBOL(get_seconds); - void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; -- cgit v1.2.3 From 9afc5eee65ca7d717a99d6fe8f4adfe32a40940a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 12:52:28 +0200 Subject: y2038: globally rename compat_time to old_time32 Christoph Hellwig suggested a slightly different path for handling backwards compatibility with the 32-bit time_t based system calls: Rather than simply reusing the compat_sys_* entry points on 32-bit architectures unchanged, we get rid of those entry points and the compat_time types by renaming them to something that makes more sense on 32-bit architectures (which don't have a compat mode otherwise), and then share the entry points under the new name with the 64-bit architectures that use them for implementing the compatibility. The following types and interfaces are renamed here, and moved from linux/compat_time.h to linux/time32.h: old new --- --- compat_time_t old_time32_t struct compat_timeval struct old_timeval32 struct compat_timespec struct old_timespec32 struct compat_itimerspec struct old_itimerspec32 ns_to_compat_timeval() ns_to_old_timeval32() get_compat_itimerspec64() get_old_itimerspec32() put_compat_itimerspec64() put_old_itimerspec32() compat_get_timespec64() get_old_timespec32() compat_put_timespec64() put_old_timespec32() As we already have aliases in place, this patch addresses only the instances that are relevant to the system call interface in particular, not those that occur in device drivers and other modules. Those will get handled separately, while providing the 64-bit version of the respective interfaces. I'm not renaming the timex, rusage and itimerval structures, as we are still debating what the new interface will look like, and whether we will need a replacement at all. This also doesn't change the names of the syscall entry points, which can be done more easily when we actually switch over the 32-bit architectures to use them, at that point we need to change COMPAT_SYSCALL_DEFINEx to SYSCALL_DEFINEx with a new name, e.g. with a _time32 suffix. Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/lkml/20180705222110.GA5698@infradead.org/ Signed-off-by: Arnd Bergmann --- kernel/compat.c | 8 +++---- kernel/futex_compat.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/signal.c | 2 +- kernel/time/hrtimer.c | 8 +++---- kernel/time/posix-stubs.c | 18 +++++++------- kernel/time/posix-timers.c | 30 ++++++++++++------------ kernel/time/time.c | 58 +++++++++++++++++++++++----------------------- 8 files changed, 65 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 8e40efc2928a..089d00d0da9c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -93,28 +93,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 83f830acbb5f..410a77a8f6e2 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -173,7 +173,7 @@ err_unlock: } COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct compat_timespec __user *, utime, u32 __user *, uaddr2, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec ts; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 625bc9897f62..8287b75ed961 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5257,13 +5257,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, - struct compat_timespec __user *, interval) + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); if (retval == 0) - retval = compat_put_timespec64(&t, interval); + retval = put_old_timespec32(&t, interval); return retval; } #endif diff --git a/kernel/signal.c b/kernel/signal.c index 5843c541fda9..a4db724e14c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3173,7 +3173,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) + struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec t; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e1a549c9e399..9cdd74bd2d27 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1660,7 +1660,7 @@ int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: - if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) + if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif @@ -1780,12 +1780,12 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; - if (compat_get_timespec64(&tu, rqtp)) + if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 2c6847d5d69b..989ccf028bde 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -162,20 +162,20 @@ COMPAT_SYS_NI(setitimer); #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec64(&new_tp, tp)) + if (get_old_timespec32(&new_tp, tp)) return -EFAULT; return do_sys_settimeofday64(&new_tp, NULL); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -184,13 +184,13 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, if (ret) return ret; - if (compat_put_timespec64(&kernel_tp, tp)) + if (put_old_timespec32(&kernel_tp, tp)) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -201,7 +201,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec64(&rtn_tp, tp)) + if (put_old_timespec32(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -210,8 +210,8 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; @@ -224,7 +224,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EINVAL; } - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4b9127e95430..3e71921668ba 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -755,13 +755,13 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - if (put_compat_itimerspec64(&cur_setting, setting)) + if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -928,8 +928,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -937,12 +937,12 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!new) return -EINVAL; - if (get_compat_itimerspec64(&new_spec, new)) + if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - if (put_compat_itimerspec64(&old_spec, old)) + if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; @@ -1115,7 +1115,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,14 +1123,14 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec64(&ts, tp)) + if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1141,7 +1141,7 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, err = kc->clock_get(which_clock, &ts); - if (!err && compat_put_timespec64(&ts, tp)) + if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; @@ -1180,7 +1180,7 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1190,7 +1190,7 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return -EINVAL; err = kc->clock_getres(which_clock, &ts); - if (!err && tp && compat_put_timespec64(&ts, tp)) + if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; @@ -1237,8 +1237,8 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; @@ -1248,7 +1248,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, if (!kc->nsleep) return -EOPNOTSUPP; - if (compat_get_timespec64(&t, rqtp)) + if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) diff --git a/kernel/time/time.c b/kernel/time/time.c index de332250d6fa..f1983f468fe3 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -104,12 +104,12 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_TIME -/* compat_time_t is a 32 bit "long" and needs to get converted. */ -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +/* old_time32_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) { - compat_time_t i; + old_time32_t i; - i = (compat_time_t)ktime_get_real_seconds(); + i = (old_time32_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -119,7 +119,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -225,7 +225,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { if (tv) { @@ -244,7 +244,7 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, return 0; } -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, +COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; @@ -863,10 +863,10 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __compat_get_timespec64(struct timespec64 *ts64, - const struct compat_timespec __user *cts) +int __get_old_timespec32(struct timespec64 *ts64, + const struct old_timespec32 __user *cts) { - struct compat_timespec ts; + struct old_timespec32 ts; int ret; ret = copy_from_user(&ts, cts, sizeof(ts)); @@ -879,33 +879,33 @@ int __compat_get_timespec64(struct timespec64 *ts64, return 0; } -int __compat_put_timespec64(const struct timespec64 *ts64, - struct compat_timespec __user *cts) +int __put_old_timespec32(const struct timespec64 *ts64, + struct old_timespec32 __user *cts) { - struct compat_timespec ts = { + struct old_timespec32 ts = { .tv_sec = ts64->tv_sec, .tv_nsec = ts64->tv_nsec }; return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; } -int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +int get_old_timespec32(struct timespec64 *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_get_timespec64(ts, uts); + return __get_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_get_timespec64); +EXPORT_SYMBOL_GPL(get_old_timespec32); -int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +int put_old_timespec32(const struct timespec64 *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; else - return __compat_put_timespec64(ts, uts); + return __put_old_timespec32(ts, uts); } -EXPORT_SYMBOL_GPL(compat_put_timespec64); +EXPORT_SYMBOL_GPL(put_old_timespec32); int get_itimerspec64(struct itimerspec64 *it, const struct __kernel_itimerspec __user *uit) @@ -937,23 +937,23 @@ int put_itimerspec64(const struct itimerspec64 *it, } EXPORT_SYMBOL_GPL(put_itimerspec64); -int get_compat_itimerspec64(struct itimerspec64 *its, - const struct compat_itimerspec __user *uits) +int get_old_itimerspec32(struct itimerspec64 *its, + const struct old_itimerspec32 __user *uits) { - if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || - __compat_get_timespec64(&its->it_value, &uits->it_value)) + if (__get_old_timespec32(&its->it_interval, &uits->it_interval) || + __get_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(get_compat_itimerspec64); +EXPORT_SYMBOL_GPL(get_old_itimerspec32); -int put_compat_itimerspec64(const struct itimerspec64 *its, - struct compat_itimerspec __user *uits) +int put_old_itimerspec32(const struct itimerspec64 *its, + struct old_itimerspec32 __user *uits) { - if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || - __compat_put_timespec64(&its->it_value, &uits->it_value)) + if (__put_old_timespec32(&its->it_interval, &uits->it_interval) || + __put_old_timespec32(&its->it_value, &uits->it_value)) return -EFAULT; return 0; } -EXPORT_SYMBOL_GPL(put_compat_itimerspec64); +EXPORT_SYMBOL_GPL(put_old_itimerspec32); -- cgit v1.2.3 From 743f5cdb6cec33d2300922f6b1b1670a572595ad Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Wed, 29 Aug 2018 20:50:52 +0800 Subject: y2038: __get_old_timespec32() can be static The kbuild test robot reports two new warnings with the previous patch: kernel/time/time.c:866:5: sparse: symbol '__get_old_timespec32' was not declared. Should it be static? kernel/time/time.c:882:5: sparse: symbol '__put_old_timespec32' was not declared. Should it be static? These are actually older bugs, but came up now after the symbol got renamed. Fortunately, commit afef05cf238c ("time: Enable get/put_compat_itimerspec64 always") makes the two functions (__compat_get_timespec64/__compat_get_timespec64) local to time.c already, so we can mark them as 'static'. Fixes: ee16c8f415e4 ("y2038: Globally rename compat_time to old_time32") Signed-off-by: kbuild test robot [arnd: added changelog text] Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index f1983f468fe3..e3a7f7fd3abc 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -863,7 +863,7 @@ int put_timespec64(const struct timespec64 *ts, } EXPORT_SYMBOL_GPL(put_timespec64); -int __get_old_timespec32(struct timespec64 *ts64, +static int __get_old_timespec32(struct timespec64 *ts64, const struct old_timespec32 __user *cts) { struct old_timespec32 ts; @@ -879,7 +879,7 @@ int __get_old_timespec32(struct timespec64 *ts64, return 0; } -int __put_old_timespec32(const struct timespec64 *ts64, +static int __put_old_timespec32(const struct timespec64 *ts64, struct old_timespec32 __user *cts) { struct old_timespec32 ts = { -- cgit v1.2.3 From 474b9c777b20b8340a6ee0f7ba6ebbd6a4bf47e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Apr 2018 21:59:47 +0200 Subject: y2038: sched: Change sched_rr_get_interval to use __kernel_timespec This is a preparation patch for converting sys_sched_rr_get_interval to work with 64-bit time_t on 32-bit architectures. The 'interval' argument is changed to struct __kernel_timespec, which will be redefined using 64-bit time_t in the future. The compat version of the system call in turn is enabled for compilation with CONFIG_COMPAT_32BIT_TIME so the individual 32-bit architectures can share the handling of the traditional argument with 64-bit architectures providing it for their compat mode. Signed-off-by: Arnd Bergmann --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8287b75ed961..39af2bec2b39 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5243,7 +5243,7 @@ out_unlock: * an error code. */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) + struct __kernel_timespec __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5254,7 +5254,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return retval; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, compat_pid_t, pid, struct old_timespec32 __user *, interval) -- cgit v1.2.3 From 49c39f8464a9af702e9d45800c00a572753aeb06 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Apr 2018 15:56:13 +0200 Subject: y2038: signal: Change rt_sigtimedwait to use __kernel_timespec This changes sys_rt_sigtimedwait() to use get_timespec64(), changing the timeout type to __kernel_timespec, which will be changed to use a 64-bit time_t in the future. Since the do_sigtimedwait() core function changes, we also have to modify the compat version of this system call in the same way. Signed-off-by: Arnd Bergmann --- kernel/signal.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a4db724e14c1..0831d56a731a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3082,7 +3082,7 @@ int copy_siginfo_from_user32(struct siginfo *to, * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, - const struct timespec *ts) + const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; @@ -3090,9 +3090,9 @@ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, int sig, ret = 0; if (ts) { - if (!timespec_valid(ts)) + if (!timespec64_valid(ts)) return -EINVAL; - timeout = timespec_to_ktime(*ts); + timeout = timespec64_to_ktime(*ts); to = &timeout; } @@ -3140,11 +3140,12 @@ static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, - siginfo_t __user *, uinfo, const struct timespec __user *, uts, + siginfo_t __user *, uinfo, + const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; - struct timespec ts; + struct timespec64 ts; siginfo_t info; int ret; @@ -3156,7 +3157,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (copy_from_user(&ts, uts, sizeof(ts))) + if (get_timespec64(&ts, uts)) return -EFAULT; } @@ -3176,7 +3177,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; - struct timespec t; + struct timespec64 t; siginfo_t info; long ret; @@ -3187,7 +3188,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return -EFAULT; if (uts) { - if (compat_get_timespec(&t, uts)) + if (get_old_timespec32(&t, uts)) return -EFAULT; } -- cgit v1.2.3 From efbaec89c642cd1d4977fc6df9923697e1598d4e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 28 Aug 2018 07:42:32 +0000 Subject: bpf: remove duplicated include from syscall.c Remove duplicated include. Signed-off-by: YueHaibing Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8339d81cba1d..3c9636f03bb2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ -- cgit v1.2.3 From 679c782de14bd48c19dd74cd1af20a2bc05dd936 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 22 Aug 2018 20:02:19 +0100 Subject: bpf/verifier: per-register parent pointers By giving each register its own liveness chain, we elide the skip_callee() logic. Instead, each register's parent is the state it inherits from; both check_func_call() and prepare_func_exit() automatically connect reg states to the correct chain since when they copy the reg state across (r1-r5 into the callee as args, and r0 out as the return value) they also copy the parent pointer. Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 184 ++++++++++++-------------------------------------- 1 file changed, 44 insertions(+), 140 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92246117d2b0..68568d22d6bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -380,9 +380,9 @@ static int copy_stack_state(struct bpf_func_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero 'parent' pointer inside bpf_verifier_state - * which this function copies over. It points to previous bpf_verifier_state - * which is never reallocated + * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state + * which this function copies over. It points to corresponding reg in previous + * bpf_verifier_state which is never reallocated */ static int realloc_func_state(struct bpf_func_state *state, int size, bool copy_old) @@ -466,7 +466,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->curframe = src->curframe; - dst_state->parent = src->parent; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -732,6 +731,7 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; + regs[i].parent = NULL; } /* frame pointer */ @@ -876,74 +876,21 @@ next: return 0; } -static -struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) -{ - struct bpf_verifier_state *tmp = NULL; - - /* 'parent' could be a state of caller and - * 'state' could be a state of callee. In such case - * parent->curframe < state->curframe - * and it's ok for r1 - r5 registers - * - * 'parent' could be a callee's state after it bpf_exit-ed. - * In such case parent->curframe > state->curframe - * and it's ok for r0 only - */ - if (parent->curframe == state->curframe || - (parent->curframe < state->curframe && - regno >= BPF_REG_1 && regno <= BPF_REG_5) || - (parent->curframe > state->curframe && - regno == BPF_REG_0)) - return parent; - - if (parent->curframe > state->curframe && - regno >= BPF_REG_6) { - /* for callee saved regs we have to skip the whole chain - * of states that belong to callee and mark as LIVE_READ - * the registers before the call - */ - tmp = parent; - while (tmp && tmp->curframe != state->curframe) { - tmp = tmp->parent; - } - if (!tmp) - goto bug; - parent = tmp; - } else { - goto bug; - } - return parent; -bug: - verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); - verbose(env, "regno %d parent frame %d current frame %d\n", - regno, parent->curframe, state->curframe); - return NULL; -} - +/* Parentage chain of this register (or stack slot) should take care of all + * issues like callee-saved registers, stack slot allocation time, etc. + */ static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - u32 regno) + const struct bpf_reg_state *state, + struct bpf_reg_state *parent) { bool writes = parent == state->parent; /* Observe write marks */ - if (regno == BPF_REG_FP) - /* We don't need to worry about FP liveness because it's read-only */ - return 0; - while (parent) { /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->live & REG_LIVE_WRITTEN) break; - parent = skip_callee(env, state, parent, regno); - if (!parent) - return -EFAULT; /* ... then we depend on parent's value */ - parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; + parent->live |= REG_LIVE_READ; state = parent; parent = state->parent; writes = true; @@ -969,7 +916,10 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - return mark_reg_read(env, vstate, vstate->parent, regno); + /* We don't need to worry about FP liveness because it's read-only */ + if (regno != BPF_REG_FP) + return mark_reg_read(env, ®s[regno], + regs[regno].parent); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -1080,8 +1030,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack */ - state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; /* only mark the slot as written if all 8 bytes were written * otherwise read propagation may incorrectly stop too soon @@ -1106,61 +1056,6 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -/* registers of every function are unique and mark_reg_read() propagates - * the liveness in the following cases: - * - from callee into caller for R1 - R5 that were used as arguments - * - from caller into callee for R0 that used as result of the call - * - from caller to the same caller skipping states of the callee for R6 - R9, - * since R6 - R9 are callee saved by implicit function prologue and - * caller's R6 != callee's R6, so when we propagate liveness up to - * parent states we need to skip callee states for R6 - R9. - * - * stack slot marking is different, since stacks of caller and callee are - * accessible in both (since caller can pass a pointer to caller's stack to - * callee which can pass it to another function), hence mark_stack_slot_read() - * has to propagate the stack liveness to all parent states at given frame number. - * Consider code: - * f1() { - * ptr = fp - 8; - * *ptr = ctx; - * call f2 { - * .. = *ptr; - * } - * .. = *ptr; - * } - * First *ptr is reading from f1's stack and mark_stack_slot_read() has - * to mark liveness at the f1's frame and not f2's frame. - * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has - * to propagate liveness to f2 states at f1's frame level and further into - * f1 states at f1's frame level until write into that stack slot - */ -static void mark_stack_slot_read(struct bpf_verifier_env *env, - const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent, - int slot, int frameno) -{ - bool writes = parent == state->parent; /* Observe write marks */ - - while (parent) { - if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) - /* since LIVE_WRITTEN mark is only done for full 8-byte - * write the read marks are conservative and parent - * state may not even have the stack allocated. In such case - * end the propagation, since the loop reached beginning - * of the function - */ - break; - /* if read wasn't screened by an earlier write ... */ - if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) - break; - /* ... then we depend on parent's value */ - parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; - state = parent; - parent = state->parent; - writes = true; - } -} - static int check_stack_read(struct bpf_verifier_env *env, struct bpf_func_state *reg_state /* func where register points to */, int off, int size, int value_regno) @@ -1198,8 +1093,8 @@ static int check_stack_read(struct bpf_verifier_env *env, */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); return 0; } else { int zeros = 0; @@ -1215,8 +1110,8 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); + mark_reg_read(env, ®_state->stack[spi].spilled_ptr, + reg_state->stack[spi].spilled_ptr.parent); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, @@ -1908,8 +1803,8 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, - spi, state->frameno); + mark_reg_read(env, &state->stack[spi].spilled_ptr, + state->stack[spi].spilled_ptr.parent); } return update_stack_depth(env, state, off); } @@ -2366,11 +2261,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); - /* copy r1 - r5 args that callee can access */ + /* copy r1 - r5 args that callee can access. The copy includes parent + * pointers, which connects us up to the liveness chain + */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) callee->regs[i] = caller->regs[i]; - /* after the call regsiters r0 - r5 were scratched */ + /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, caller->regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); @@ -4370,7 +4267,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, /* explored state didn't use this */ return true; - equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; if (rold->type == PTR_TO_STACK) /* two stack pointers are equal only if they're pointing to @@ -4603,7 +4500,7 @@ static bool states_equal(struct bpf_verifier_env *env, * equivalent state (jump target or such) we didn't arrive by the straight-line * code, so read marks in the state must propagate to the parent regardless * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() and mark_stack_slot_read() is for. + * in mark_reg_read() is for. */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, @@ -4624,7 +4521,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, vstate, vparent, i); + err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], + &vparent->frame[vstate->curframe]->regs[i]); if (err) return err; } @@ -4639,7 +4537,8 @@ static int propagate_liveness(struct bpf_verifier_env *env, if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_stack_slot_read(env, vstate, vparent, i, frame); + mark_reg_read(env, &state->stack[i].spilled_ptr, + &parent->stack[i].spilled_ptr); } } return err; @@ -4649,7 +4548,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err; sl = env->explored_states[insn_idx]; @@ -4691,16 +4590,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - err = copy_verifier_state(&new_sl->state, cur); + new = &new_sl->state; + err = copy_verifier_state(new, cur); if (err) { - free_verifier_state(&new_sl->state, false); + free_verifier_state(new, false); kfree(new_sl); return err; } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - cur->parent = &new_sl->state; + for (i = 0; i < BPF_REG_FP; i++) + cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -4713,9 +4614,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { struct bpf_func_state *frame = cur->frame[j]; + struct bpf_func_state *newframe = new->frame[j]; - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.parent = + &newframe->stack[i].spilled_ptr; + } } return 0; } @@ -4734,7 +4639,6 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; - state->parent = NULL; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); -- cgit v1.2.3 From 8efea21d333d21e1f9177579ffdc69556314f603 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 22 Aug 2018 20:02:44 +0100 Subject: bpf/verifier: display non-spill stack slot types in print_verifier_state If a stack slot does not hold a spilled register (STACK_SPILL), then each of its eight bytes could potentially have a different slot_type. This information can be important for debugging, and previously we either did not print anything for the stack slot, or just printed fp-X=0 in the case where its first byte was STACK_ZERO. Instead, print eight characters with either 0 (STACK_ZERO), m (STACK_MISC) or ? (STACK_INVALID) for any stack slot which is neither STACK_SPILL nor entirely STACK_INVALID. Signed-off-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 68568d22d6bd..f4ff0c569e54 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -263,6 +263,13 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static char slot_type_char[] = { + [STACK_INVALID] = '?', + [STACK_SPILL] = 'r', + [STACK_MISC] = 'm', + [STACK_ZERO] = '0', +}; + static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { @@ -349,15 +356,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) { - verbose(env, " fp%d", - (-i - 1) * BPF_REG_SIZE); - print_liveness(env, state->stack[i].spilled_ptr.live); + char types_buf[BPF_REG_SIZE + 1]; + bool valid = false; + int j; + + for (j = 0; j < BPF_REG_SIZE; j++) { + if (state->stack[i].slot_type[j] != STACK_INVALID) + valid = true; + types_buf[j] = slot_type_char[ + state->stack[i].slot_type[j]]; + } + types_buf[BPF_REG_SIZE] = 0; + if (!valid) + continue; + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); - } - if (state->stack[i].slot_type[0] == STACK_ZERO) - verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); + else + verbose(env, "=%s", types_buf); } verbose(env, "\n"); } -- cgit v1.2.3 From c7b27c37af3da5a63f32b0bc99569e3069e4d9c1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 29 Aug 2018 14:43:13 -0700 Subject: bpf: add bpffs pretty print for percpu arraymap/hash/lru_hash Added bpffs pretty print for percpu arraymap, percpu hashmap and percpu lru hashmap. For each map pair, the format is: : { cpu0: cpu1: ... cpun: } For example, on my VM, there are 4 cpus, and for test_btf test in the next patch: cat /sys/fs/bpf/pprint_test_percpu_hash You may get: ... 43602: { cpu0: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu1: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu2: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} cpu3: {43602,0,-43602,0x3,0xaa52,0x3,{43602|[82,170,0,0,0,0,0,0]},ENUM_TWO} } 72847: { cpu0: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu1: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu2: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} cpu3: {72847,0,-72847,0x3,0x11c8f,0x3,{72847|[143,28,1,0,0,0,0,0]},ENUM_THREE} } ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 24 ++++++++++++++++++++++++ kernel/bpf/hashtab.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 0c17aab3ce5f..f9d24121be99 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } +static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + seq_printf(m, "%u: {\n", *(u32 *)key); + pptr = array->pptrs[index & array->index_mask]; + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) @@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_lookup_elem = percpu_array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_seq_show_elem = percpu_array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 03cc59ee9c95..2c1790288138 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1285,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } +static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + struct htab_elem *l; + void __percpu *pptr; + int cpu; + + rcu_read_lock(); + + l = __htab_map_lookup_elem(map, key); + if (!l) { + rcu_read_unlock(); + return; + } + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": {\n"); + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + seq_printf(m, "\tcpu%d: ", cpu); + btf_type_seq_show(map->btf, map->btf_value_type_id, + per_cpu_ptr(pptr, cpu), m); + seq_puts(m, "\n"); + } + seq_puts(m, "}\n"); + + rcu_read_unlock(); +} + const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, @@ -1293,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_lookup_elem = htab_percpu_map_lookup_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; const struct bpf_map_ops htab_lru_percpu_map_ops = { @@ -1303,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_lookup_elem = htab_lru_percpu_map_lookup_elem, .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_seq_show_elem = htab_percpu_map_seq_show_elem, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) -- cgit v1.2.3 From a9c676bc8fc58d00eea9836fb14ee43c0346416a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 4 Sep 2018 19:13:44 -0700 Subject: bpf/verifier: fix verifier instability Edward Cree says: In check_mem_access(), for the PTR_TO_CTX case, after check_ctx_access() has supplied a reg_type, the other members of the register state are set appropriately. Previously reg.range was set to 0, but as it is in a union with reg.map_ptr, which is larger, upper bytes of the latter were left in place. This then caused the memcmp() in regsafe() to fail, preventing some branches from being pruned (and occasionally causing the same program to take a varying number of processed insns on repeated verifier runs). Fix the instability by clearing bpf_reg_state in __mark_reg_[un]known() Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Debugged-by: Edward Cree Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f4ff0c569e54..6ff1bac1795d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -570,7 +570,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg); */ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - reg->id = 0; + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -589,7 +591,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) static void __mark_reg_const_zero(struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); - reg->off = 0; reg->type = SCALAR_VALUE; } @@ -700,9 +701,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(struct bpf_reg_state *reg) { + /* + * Clear type, id, off, and union(map_ptr, range) and + * padding between 'type' and union + */ + memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->off = 0; reg->var_off = tnum_unknown; reg->frameno = 0; __mark_reg_unbounded(reg); @@ -1640,9 +1644,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn else mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].id = 0; - regs[value_regno].off = 0; - regs[value_regno].range = 0; regs[value_regno].type = reg_type; } @@ -2495,7 +2496,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() -- cgit v1.2.3 From 0d42d73a37ff91028785e42a6bf12fc020a277c1 Mon Sep 17 00:00:00 2001 From: Igor Stoppa Date: Wed, 5 Sep 2018 23:34:43 +0300 Subject: seccomp: remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. Signed-off-by: Igor Stoppa Acked-by: Kees Cook Cc: linux-security-module@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: James Morris --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fd023ac24e10..5a2a9af4663e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ - if (unlikely(WARN_ON(f == NULL))) + if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; if (!sd) { @@ -297,7 +297,7 @@ static inline pid_t seccomp_can_sync_threads(void) /* Return the first thread that cannot be synchronized. */ failed = task_pid_vnr(thread); /* If the pid cannot be resolved, then return -ESRCH */ - if (unlikely(WARN_ON(failed == 0))) + if (WARN_ON(failed == 0)) failed = -ESRCH; return failed; } -- cgit v1.2.3 From 86989c41b5ea08776c450cb759592532314a4ed6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 19 Jul 2018 19:47:27 -0500 Subject: signal: Always ignore SIGKILL and SIGSTOP sent to the global init If the first process started (aka /sbin/init) receives a SIGKILL it will panic the system if it is delivered. Making the system unusable and undebugable. It isn't much better if the first process started receives SIGSTOP. So always ignore SIGSTOP and SIGKILL sent to init. This is done in a separate clause in sig_task_ignored as force_sig_info can clear SIG_UNKILLABLE and this protection should work even then. Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5843c541fda9..b33264bb2064 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -78,6 +78,10 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); + /* SIGKILL and SIGSTOP may not be sent to the global init */ + if (unlikely(is_global_init(t) && sig_kernel_only(sig))) + return true; + if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; -- cgit v1.2.3 From 55a3235fc71bf34303e34a95eeee235b2d2a35dd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 19 Jul 2018 20:33:53 -0500 Subject: signal: Properly deliver SIGILL from uprobes For userspace to tell the difference between a random signal and an exception, the exception must include siginfo information. Using SEND_SIG_FORCED for SIGILL is thus wrong, and it will result in userspace seeing si_code == SI_USER (like a random signal) instead of si_code == SI_KERNEL or a more specific si_code as all exceptions deliver. Therefore replace force_sig_info(SIGILL, SEND_SIG_FORCE, current) with force_sig(SIG_ILL, current) which gets this right and is shorter and easier to type. Fixes: 014940bad8e4 ("uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails") Fixes: 0b5256c7f173 ("uprobes: Send SIGILL if handle_trampoline() fails") Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3207a4d26849..2bf792d22087 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1858,7 +1858,7 @@ static void handle_trampoline(struct pt_regs *regs) sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); - force_sig_info(SIGILL, SEND_SIG_FORCED, current); + force_sig(SIGILL, current); } @@ -1966,7 +1966,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); - force_sig_info(SIGILL, SEND_SIG_FORCED, current); + force_sig(SIGILL, current); } } -- cgit v1.2.3 From 3597dfe01d12f570bc739da67f857fd222a3ea66 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Sep 2018 20:02:46 +0200 Subject: signal: Always deliver the kernel's SIGKILL and SIGSTOP to a pid namespace init Instead of playing whack-a-mole and changing SEND_SIG_PRIV to SEND_SIG_FORCED throughout the kernel to ensure a pid namespace init gets signals sent by the kernel, stop allowing a pid namespace init to ignore SIGKILL or SIGSTOP sent by the kernel. A pid namespace init is only supposed to be able to ignore signals sent from itself and children with SIG_DFL. Fixes: 921cf9f63089 ("signals: protect cinit from unblocked SIG_DFL signals") Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b33264bb2064..8081ab79e97d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1039,7 +1039,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, - from_ancestor_ns || (info == SEND_SIG_FORCED))) + from_ancestor_ns || (info == SEND_SIG_PRIV) || (info == SEND_SIG_FORCED))) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; -- cgit v1.2.3 From 035150540545f62bada95860ba00fe1e0cd62f63 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 05:31:53 -0500 Subject: signal: Don't send siginfo to kthreads. Today kernel threads never dequeue siginfo so it is pointless to enqueue siginfo for them. The usb gadget mass storage driver goes one farther and uses SEND_SIG_FORCED to guarantee that no siginfo is even enqueued. Generalize the optimization of the usb mass storage driver and never perform an unnecessary allocation when delivering signals to kthreads. Switch the mass storage driver from sending signals with SEND_SIG_FORCED to SEND_SIG_PRIV. As using SEND_SIG_FORCED is now unnecessary. Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8081ab79e97d..20931a892ace 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1057,7 +1057,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. */ - if (info == SEND_SIG_FORCED) + if ((info == SEND_SIG_FORCED) || (t->flags & PF_KTHREAD)) goto out_set; /* -- cgit v1.2.3 From f149b31557446aff9ca96d4be7e39cc266f6e7cc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Sep 2018 09:50:36 +0200 Subject: signal: Never allocate siginfo for SIGKILL or SIGSTOP The SIGKILL and SIGSTOP signals are never delivered to userspace so queued siginfo for these signals can never be observed. Therefore remove the chance of failure by never even attempting to allocate siginfo in those cases. Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 20931a892ace..d7d1adf735f4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,10 +1054,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, result = TRACE_SIGNAL_DELIVERED; /* - * fast-pathed signals for kernel-internal things like SIGSTOP - * or SIGKILL. + * Skip useless siginfo allocation for SIGKILL SIGSTOP, + * and kernel threads. */ - if ((info == SEND_SIG_FORCED) || (t->flags & PF_KTHREAD)) + if ((info == SEND_SIG_FORCED) || + sig_kernel_only(sig) || (t->flags & PF_KTHREAD)) goto out_set; /* -- cgit v1.2.3 From 079b22dc9be985c591589fcb94769b8e13518aa0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Sep 2018 10:32:52 +0200 Subject: signal: Use SEND_SIG_PRIV not SEND_SIG_FORCED with SIGKILL and SIGSTOP Now that siginfo is never allocated for SIGKILL and SIGSTOP there is no difference between SEND_SIG_PRIV and SEND_SIG_FORCED for SIGKILL and SIGSTOP. This makes SEND_SIG_FORCED unnecessary and redundant in the presence of SIGKILL and SIGSTOP. Therefore change users of SEND_SIG_FORCED that are sending SIGKILL or SIGSTOP to use SEND_SIG_PRIV instead. This removes the last users of SEND_SIG_FORCED. Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 +- kernel/ptrace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 2a2ac53d8b8b..c8d53397bbdd 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -216,7 +216,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) - send_sig_info(SIGKILL, SEND_SIG_FORCED, task); + send_sig_info(SIGKILL, SEND_SIG_PRIV, task); } read_unlock(&tasklist_lock); rcu_read_unlock(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 21fec73d45d4..45f77a1b9c97 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -396,7 +396,7 @@ static int ptrace_attach(struct task_struct *task, long request, /* SEIZE doesn't trap tracee on attach */ if (!seize) - send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + send_sig_info(SIGSTOP, SEND_SIG_PRIV, task); spin_lock(&task->sighand->siglock); @@ -563,7 +563,7 @@ void exit_ptrace(struct task_struct *tracer, struct list_head *dead) list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) - send_sig_info(SIGKILL, SEND_SIG_FORCED, p); + send_sig_info(SIGKILL, SEND_SIG_PRIV, p); if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, dead); -- cgit v1.2.3 From 4ff4c31a6e85f4c49fbeebeaa28018d002884b5a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Sep 2018 10:39:04 +0200 Subject: signal: Remove SEND_SIG_FORCED There are no more users of SEND_SIG_FORCED so it may be safely removed. Remove the definition of SEND_SIG_FORCED, it's use in is_si_special, it's use in TP_STORE_SIGINFO, and it's use in __send_signal as without any users the uses of SEND_SIG_FORCED are now unncessary. This makes the code simpler, easier to understand and use. Users of signal sending functions now no longer need to ask themselves do I need to use SEND_SIG_FORCED. Reviewed-by: Thomas Gleixner Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d7d1adf735f4..ec136fda457a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -736,7 +736,7 @@ static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) static inline int is_si_special(const struct siginfo *info) { - return info <= SEND_SIG_FORCED; + return info <= SEND_SIG_PRIV; } static inline bool si_fromuser(const struct siginfo *info) @@ -1039,7 +1039,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, - from_ancestor_ns || (info == SEND_SIG_PRIV) || (info == SEND_SIG_FORCED))) + from_ancestor_ns || (info == SEND_SIG_PRIV))) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; @@ -1057,8 +1057,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * Skip useless siginfo allocation for SIGKILL SIGSTOP, * and kernel threads. */ - if ((info == SEND_SIG_FORCED) || - sig_kernel_only(sig) || (t->flags & PF_KTHREAD)) + if (sig_kernel_only(sig) || (t->flags & PF_KTHREAD)) goto out_set; /* -- cgit v1.2.3 From a7c19db38d62fc1ce797dba19936e9f81cf2b9fb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 6 Sep 2018 17:26:04 -0700 Subject: bpf: add bpffs pretty print for program array map Added bpffs pretty print for program array map. For a particular array index, if the program array points to a valid program, the ": " will be printed out like 0: 6 which means bpf program with id "6" is installed at index "0". Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f9d24121be99..dded84cbe814 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -553,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } +static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void **elem, *ptr; + u32 prog_id; + + rcu_read_lock(); + + elem = array_map_lookup_elem(map, key); + if (elem) { + ptr = READ_ONCE(*elem); + if (ptr) { + seq_printf(m, "%u: ", *(u32 *)key); + prog_id = prog_fd_array_sys_lookup_elem(ptr); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &prog_id, m); + seq_puts(m, "\n"); + } + } + + rcu_read_unlock(); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -564,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, - .map_check_btf = map_check_no_btf, + .map_seq_show_elem = prog_array_map_seq_show_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, -- cgit v1.2.3 From d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:18 -0700 Subject: flow_dissector: implements flow dissector BPF hook Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector path. The BPF program is per-network namespace. Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c9636f03bb2..b3c2d09bcf7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_LIRC_MODE2: ptype = BPF_PROG_TYPE_LIRC_MODE2; break; + case BPF_FLOW_DISSECTOR: + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; + break; default: return -EINVAL; } @@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + break; default: ret = cgroup_bpf_prog_attach(attr, ptype, prog); } @@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_bpf_prog_detach(attr); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ff1bac1795d..8ccbff4fff93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -261,6 +261,7 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", }; static char slot_type_char[] = { @@ -965,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: return true; default: @@ -1238,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -1321,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_flow_keys_access(struct bpf_verifier_env *env, int off, + int size) +{ + if (size < 0 || off < 0 || + (u64)off + size > sizeof(struct bpf_flow_keys)) { + verbose(env, "invalid access to flow keys off=%d size=%d\n", + off, size); + return -EACCES; + } + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -1422,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * right in front, treat it the very same way. */ return check_pkt_ptr_alignment(env, reg, off, size, strict); + case PTR_TO_FLOW_KEYS: + pointer_desc = "flow keys "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1692,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_FLOW_KEYS) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into flow keys\n", + value_regno); + return -EACCES; + } + + err = check_flow_keys_access(env, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1839,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_FLOW_KEYS: + return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); @@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_CTX: case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: /* Only valid matches are exact, which memcmp() above * would have accepted */ -- cgit v1.2.3 From 82058d6684658430cd9b4123d4c3e863fd48f813 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Jul 2018 16:35:14 -0500 Subject: signal: Use group_send_sig_info to kill all processes in a pid namespace Replace send_sig_info in zap_pid_ns_processes with group_send_sig_info. This makes more sense as the entire process group is being killed. More importantly this allows the kill of those processes with PIDTYPE_MAX to indicate all of the process in the pid namespace are being signaled. This is needed for fork to detect when signals are sent to a group of processes. Admittedly fork has another case to catch SIGKILL but the principle remains that it is desirable to know when a group of processes is being signaled. Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c8d53397bbdd..aa6e72fb7c08 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -216,7 +216,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) - send_sig_info(SIGKILL, SEND_SIG_PRIV, task); + group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); } read_unlock(&tasklist_lock); rcu_read_unlock(); -- cgit v1.2.3 From b21c5bd562dd97ac0b936439fc64bd30ec09b2e0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 21 Jul 2018 11:34:03 -0500 Subject: signal: Remove specific_send_sig_info This function is static and it only has two callers. As specific_send_sig_info is only called twice remembering what specific_send_sig_info does when reading the code is difficutl and it makes it hard to see which sending sending functions are equivalent to which others. So remove specific_send_sig_info to make the code easier to read. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ec136fda457a..99e91163c9a3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1199,12 +1199,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return send_signal(sig, info, p, PIDTYPE_TGID); } -static int -specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) -{ - return send_signal(sig, info, t, PIDTYPE_PID); -} - int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, enum pid_type type) { @@ -1254,7 +1248,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) */ if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; - ret = specific_send_sig_info(sig, info, t); + ret = send_signal(sig, info, t, PIDTYPE_PID); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; @@ -2330,7 +2324,7 @@ static int ptrace_signal(int signr, siginfo_t *info) /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); + send_signal(signr, info, current, PIDTYPE_PID); signr = 0; } -- cgit v1.2.3 From fb50f5a4011c499bc1b1fae77299cfcb3945e51b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Sep 2018 19:26:35 +0200 Subject: signal: Pair exports with their functions For readability and consistency with the other exports in kernel/signal.c pair the exports of signal sending functions with their functions, instead of having the exports in one big clump. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 99e91163c9a3..e16278710b36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -176,6 +176,7 @@ void recalc_sigpending(void) clear_thread_flag(TIF_SIGPENDING); } +EXPORT_SYMBOL(recalc_sigpending); void calculate_sigpending(void) { @@ -466,6 +467,7 @@ void flush_signals(struct task_struct *t) flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } +EXPORT_SYMBOL(flush_signals); #ifdef CONFIG_POSIX_TIMERS static void __flush_itimer_signals(struct sigpending *pending) @@ -684,6 +686,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) #endif return signr; } +EXPORT_SYMBOL_GPL(dequeue_signal); /* * Tell a process that it has a new active signal.. @@ -1490,6 +1493,7 @@ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) return do_send_sig_info(sig, info, p, PIDTYPE_PID); } +EXPORT_SYMBOL(send_sig_info); #define __si_special(priv) \ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) @@ -1499,11 +1503,13 @@ send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, __si_special(priv), p); } +EXPORT_SYMBOL(send_sig); void force_sig(int sig, struct task_struct *p) { force_sig_info(sig, SEND_SIG_PRIV, p); } +EXPORT_SYMBOL(force_sig); /* * When things go south during signal handling, we @@ -2634,14 +2640,6 @@ out: } } -EXPORT_SYMBOL(recalc_sigpending); -EXPORT_SYMBOL_GPL(dequeue_signal); -EXPORT_SYMBOL(flush_signals); -EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(send_sig); -EXPORT_SYMBOL(send_sig_info); -EXPORT_SYMBOL(sigprocmask); - /* * System call entry points. */ @@ -2735,6 +2733,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) __set_current_blocked(&newset); return 0; } +EXPORT_SYMBOL(sigprocmask); /** * sys_rt_sigprocmask - change the list of currently blocked signals -- cgit v1.2.3 From 788758d1fe874fd20ecb0ab490552d94c024a9de Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Thu, 20 Sep 2018 17:46:12 +0800 Subject: bpf: remove redundant null pointer check before consume_skb consume_skb has taken the null pointer into account. hence it is safe to remove the redundant null pointer check before consume_skb. Signed-off-by: zhong jiang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..a9359cbc3f93 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -590,8 +590,7 @@ static int free_sg(struct sock *sk, int start, if (i == MAX_SKB_FRAGS) i = 0; } - if (md->skb) - consume_skb(md->skb); + consume_skb(md->skb); return free; } @@ -973,8 +972,7 @@ bytes_ready: if (!sg->length && md->sg_start == md->sg_end) { list_del(&md->list); - if (md->skb) - consume_skb(md->skb); + consume_skb(md->skb); kfree(md); } } -- cgit v1.2.3 From 5bf7a60b8e70969f65c961d7e2c4eb40eb2c664d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Sep 2018 14:37:30 -0700 Subject: bpf: permit CGROUP_DEVICE programs accessing helper bpf_get_current_cgroup_id() Currently, helper bpf_get_current_cgroup_id() is not permitted for CGROUP_DEVICE type of programs. If the helper is used in such cases, the verifier will log the following error: 0: (bf) r6 = r1 1: (69) r7 = *(u16 *)(r6 +0) 2: (85) call bpf_get_current_cgroup_id#80 unknown func bpf_get_current_cgroup_id#80 The bpf_get_current_cgroup_id() is useful for CGROUP_DEVICE type of programs in order to customize action based on cgroup id. This patch added such a support. Cc: Roman Gushchin Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6a7d931bbc55..549f6fbcc461 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -677,6 +677,8 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); -- cgit v1.2.3 From dc6253108f0fbff4a634055d5b8a91958ec2af81 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 28 Sep 2018 12:03:28 -0400 Subject: tick/broadcast: Remove redundant check tick_device_is_functional() is called early in tick_broadcast_control(), so no need to call it again later. Signed-off-by: Peng Hao Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Link: https://lkml.kernel.org/r/1538150608-2599-1-git-send-email-penghao122@sina.com.cn --- kernel/time/tick-broadcast.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aa2094d5dd27..be0aac2b4300 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -400,8 +400,6 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); - if (!tick_device_is_functional(dev)) - break; if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) -- cgit v1.2.3 From 513145ea66af95f1a5c744d7b5a4f4a97625e669 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 1 Oct 2018 11:05:21 +0100 Subject: genirq/debugfs: Reset domain debugfs_file on removal of the debugfs file When removing a debugfs file for a given irq domain, we fail to clear the corresponding field, meaning that the corresponding domain won't be created again if we need to do so. It turns out that this is exactly what irq_domain_update_bus_token does (delete old file, update domain name, recreate file). This doesn't have any impact other than making debug more difficult, but we do value ease of debugging... So clear the debugfs_file field. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20181001100522.180054-2-marc.zyngier@arm.com --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 5d9fc01b60a6..95a0acbdd4e6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1741,6 +1741,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { debugfs_remove(d->debugfs_file); + d->debugfs_file = NULL; } void __init irq_domain_debugfs_init(struct dentry *root) -- cgit v1.2.3 From 94967b55ebf3b603f2fe750ecedd896042585a1c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 1 Oct 2018 11:05:22 +0100 Subject: genirq/debugfs: Reinstate full OF path for domain name On a DT based system, we use the of_node full name to name the corresponding irq domain. We expect that name to be unique, so so that domains with the same base name won't clash (this happens on multi-node topologies, for example). Since a7e4cfb0a7ca ("of/fdt: only store the device node basename in full_name"), of_node_full_name() lies and only returns the basename. This breaks the above requirement, and we end-up with only a subset of the domains in /sys/kernel/debug/irq/domains. Let's reinstate the feature by using the fancy new %pOF format specifier, which happens to do the right thing. Fixes: a7e4cfb0a7ca ("of/fdt: only store the device node basename in full_name") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20181001100522.180054-3-marc.zyngier@arm.com --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 95a0acbdd4e6..3b30a4aeb0db 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -183,7 +183,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, * unhappy about. Replace them with ':', which does * the trick and is not as offensive as '\'... */ - name = kstrdup(of_node_full_name(of_node), GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "%pOF", of_node); if (!name) { kfree(domain); return NULL; -- cgit v1.2.3 From 8bad74f9840f87661f20ced3dc80c84ab4fd55a1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:36 +0000 Subject: bpf: extend cgroup bpf core to allow multiple cgroup storage types In order to introduce per-cpu cgroup storage, let's generalize bpf cgroup core to support multiple cgroup storage types. Potentially, per-node cgroup storage can be added later. This commit is mostly a formal change that replaces cgroup_storage pointer with a array of cgroup_storage pointers. It doesn't actually introduce a new storage type, it will be done later. Each bpf program is now able to have one cgroup storage of each type. Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/cgroup.c | 74 ++++++++++++++++++++++++++++++++-------------- kernel/bpf/helpers.c | 15 ++++++---- kernel/bpf/local_storage.c | 18 ++++++----- kernel/bpf/syscall.c | 9 ++++-- kernel/bpf/verifier.c | 8 +++-- 5 files changed, 85 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 549f6fbcc461..00f6ed2e4f9a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key); */ void cgroup_bpf_put(struct cgroup *cgrp) { + enum bpf_cgroup_storage_type stype; unsigned int type; for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { @@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp) list_for_each_entry_safe(pl, tmp, progs, node) { list_del(&pl->node); bpf_prog_put(pl->prog); - bpf_cgroup_storage_unlink(pl->storage); - bpf_cgroup_storage_free(pl->storage); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_unlink(pl->storage[stype]); + bpf_cgroup_storage_free(pl->storage[stype]); + } kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, struct bpf_prog_array __rcu **array) { + enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; @@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp, continue; progs->items[cnt].prog = pl->prog; - progs->items[cnt].cgroup_storage = pl->storage; + for_each_cgroup_storage_type(stype) + progs->items[cnt].cgroup_storage[stype] = + pl->storage[stype]; cnt++; } } while ((p = cgroup_parent(p))); @@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; - struct bpf_cgroup_storage *storage, *old_storage = NULL; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], + *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; + enum bpf_cgroup_storage_type stype; struct bpf_prog_list *pl; bool pl_was_allocated; int err; @@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; - storage = bpf_cgroup_storage_alloc(prog); - if (IS_ERR(storage)) - return -ENOMEM; + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } if (flags & BPF_F_ALLOW_MULTI) { list_for_each_entry(pl, progs, node) { if (pl->prog == prog) { /* disallow attaching the same prog twice */ - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -EINVAL; } } pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } pl_was_allocated = true; pl->prog = prog; - pl->storage = storage; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; list_add_tail(&pl->node, progs); } else { if (list_empty(progs)) { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return -ENOMEM; } pl_was_allocated = true; @@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, } else { pl = list_first_entry(progs, typeof(*pl), node); old_prog = pl->prog; - old_storage = pl->storage; - bpf_cgroup_storage_unlink(old_storage); + for_each_cgroup_storage_type(stype) { + old_storage[stype] = pl->storage[stype]; + bpf_cgroup_storage_unlink(old_storage[stype]); + } pl_was_allocated = false; } pl->prog = prog; - pl->storage = storage; + for_each_cgroup_storage_type(stype) + pl->storage[stype] = storage[stype]; } cgrp->bpf.flags[type] = flags; @@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, goto cleanup; static_branch_inc(&cgroup_bpf_enabled_key); - if (old_storage) - bpf_cgroup_storage_free(old_storage); + for_each_cgroup_storage_type(stype) { + if (!old_storage[stype]) + continue; + bpf_cgroup_storage_free(old_storage[stype]); + } if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_cgroup_storage_link(storage, cgrp, type); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_link(storage[stype], cgrp, type); return 0; cleanup: /* and cleanup the prog list */ pl->prog = old_prog; - bpf_cgroup_storage_free(pl->storage); - pl->storage = old_storage; - bpf_cgroup_storage_link(old_storage, cgrp, type); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_free(pl->storage[stype]); + pl->storage[stype] = old_storage[stype]; + bpf_cgroup_storage_link(old_storage[stype], cgrp, type); + } if (pl_was_allocated) { list_del(&pl->node); kfree(pl); @@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 unused_flags) { struct list_head *progs = &cgrp->bpf.progs[type]; + enum bpf_cgroup_storage_type stype; u32 flags = cgrp->bpf.flags[type]; struct bpf_prog *old_prog = NULL; struct bpf_prog_list *pl; @@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storage_unlink(pl->storage); - bpf_cgroup_storage_free(pl->storage); + for_each_cgroup_storage_type(stype) { + bpf_cgroup_storage_unlink(pl->storage[stype]); + bpf_cgroup_storage_free(pl->storage[stype]); + } kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1991466b8327..9070b2ace6aa 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -194,16 +194,18 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { .ret_type = RET_INTEGER, }; -DECLARE_PER_CPU(void*, bpf_cgroup_storage); +#ifdef CONFIG_CGROUP_BPF +DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { - /* map and flags arguments are not used now, - * but provide an ability to extend the API - * for other types of local storages. - * verifier checks that their values are correct. + /* flags argument is not used now, + * but provides an ability to extend the API. + * verifier checks that its value is correct. */ - return (unsigned long) this_cpu_read(bpf_cgroup_storage); + enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + + return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]); } const struct bpf_func_proto bpf_get_local_storage_proto = { @@ -214,3 +216,4 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif +#endif diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 22ad967d1e5f..0bd9f19fc557 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,7 +7,7 @@ #include #include -DEFINE_PER_CPU(void*, bpf_cgroup_storage); +DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -251,6 +251,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) { + enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); int ret = -EBUSY; @@ -258,11 +259,12 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) if (map->prog && map->prog != prog) goto unlock; - if (prog->aux->cgroup_storage && prog->aux->cgroup_storage != _map) + if (prog->aux->cgroup_storage[stype] && + prog->aux->cgroup_storage[stype] != _map) goto unlock; map->prog = prog; - prog->aux->cgroup_storage = _map; + prog->aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -272,24 +274,26 @@ unlock: void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) { + enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage != _map); + WARN_ON(prog->aux->cgroup_storage[stype] != _map); map->prog = NULL; - prog->aux->cgroup_storage = NULL; + prog->aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } -struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog) +struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, + enum bpf_cgroup_storage_type stype) { struct bpf_cgroup_storage *storage; struct bpf_map *map; u32 pages; - map = prog->aux->cgroup_storage; + map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b3c2d09bcf7a..8c91d2b41b1e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -988,10 +988,15 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { + enum bpf_cgroup_storage_type stype; int i; - if (aux->cgroup_storage) - bpf_cgroup_storage_release(aux->prog, aux->cgroup_storage); + for_each_cgroup_storage_type(stype) { + if (!aux->cgroup_storage[stype]) + continue; + bpf_cgroup_storage_release(aux->prog, + aux->cgroup_storage[stype]); + } for (i = 0; i < aux->used_map_cnt; i++) bpf_map_put(aux->used_maps[i]); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e986518d7bc3..e90899df585d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5171,11 +5171,15 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { + enum bpf_cgroup_storage_type stype; int i; - if (env->prog->aux->cgroup_storage) + for_each_cgroup_storage_type(stype) { + if (!env->prog->aux->cgroup_storage[stype]) + continue; bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage); + env->prog->aux->cgroup_storage[stype]); + } for (i = 0; i < env->used_map_cnt; i++) bpf_map_put(env->used_maps[i]); -- cgit v1.2.3 From f294b37ec7b24a574884cd157497a3748081c0f0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:40 +0000 Subject: bpf: rework cgroup storage pointer passing To simplify the following introduction of per-cpu cgroup storage, let's rework a bit a mechanism of passing a pointer to a cgroup storage into the bpf_get_local_storage(). Let's save a pointer to the corresponding bpf_cgroup_storage structure, instead of a pointer to the actual buffer. It will help us to handle per-cpu storage later, which has a different way of accessing to the actual data. Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/helpers.c | 8 ++++++-- kernel/bpf/local_storage.c | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9070b2ace6aa..e42f8789b7ea 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -195,7 +195,8 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DECLARE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -204,8 +205,11 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); + struct bpf_cgroup_storage *storage; - return (unsigned long) this_cpu_read(bpf_cgroup_storage[stype]); + storage = this_cpu_read(bpf_cgroup_storage[stype]); + + return (unsigned long)&READ_ONCE(storage->buf)->data[0]; } const struct bpf_func_proto bpf_get_local_storage_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 0bd9f19fc557..6742292fb39e 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -7,7 +7,8 @@ #include #include -DEFINE_PER_CPU(void*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DEFINE_PER_CPU(struct bpf_cgroup_storage*, + bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF -- cgit v1.2.3 From b741f1630346defcbc8cc60f1a2bdae8b3b0036f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:43 +0000 Subject: bpf: introduce per-cpu cgroup local storage This commit introduced per-cpu cgroup local storage. Per-cpu cgroup local storage is very similar to simple cgroup storage (let's call it shared), except all the data is per-cpu. The main goal of per-cpu variant is to implement super fast counters (e.g. packet counters), which don't require neither lookups, neither atomic operations. >From userspace's point of view, accessing a per-cpu cgroup storage is similar to other per-cpu map types (e.g. per-cpu hashmaps and arrays). Writing to a per-cpu cgroup storage is not atomic, but is performed by copying longs, so some minimal atomicity is here, exactly as with other per-cpu maps. Signed-off-by: Roman Gushchin Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/helpers.c | 8 ++- kernel/bpf/local_storage.c | 150 +++++++++++++++++++++++++++++++++++++++------ kernel/bpf/syscall.c | 11 +++- kernel/bpf/verifier.c | 15 +++-- 4 files changed, 157 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e42f8789b7ea..6502115e8f55 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); struct bpf_cgroup_storage *storage; + void *ptr; storage = this_cpu_read(bpf_cgroup_storage[stype]); - return (unsigned long)&READ_ONCE(storage->buf)->data[0]; + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; } const struct bpf_func_proto bpf_get_local_storage_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6742292fb39e..944eb297465f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, + void *value) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(storage->percpu_buf, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, + void *value, u64 map_flags) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + return -EINVAL; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), + value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, void *_next_key) { @@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) spin_unlock_bh(&map->lock); } +static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) +{ + size_t size; + + if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { + size = sizeof(struct bpf_storage_buffer) + map->value_size; + *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, + PAGE_SIZE) >> PAGE_SHIFT; + } else { + size = map->value_size; + *pages = round_up(round_up(size, 8) * num_possible_cpus(), + PAGE_SIZE) >> PAGE_SHIFT; + } + + return size; +} + struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { struct bpf_cgroup_storage *storage; struct bpf_map *map; + gfp_t flags; + size_t size; u32 pages; map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + size = bpf_cgroup_storage_calculate_size(map, &pages); + if (bpf_map_charge_memlock(map, pages)) return ERR_PTR(-EPERM); storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), __GFP_ZERO | GFP_USER, map->numa_node); - if (!storage) { - bpf_map_uncharge_memlock(map, pages); - return ERR_PTR(-ENOMEM); - } + if (!storage) + goto enomem; - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, - map->numa_node); - if (!storage->buf) { - bpf_map_uncharge_memlock(map, pages); - kfree(storage); - return ERR_PTR(-ENOMEM); + flags = __GFP_ZERO | GFP_USER; + + if (stype == BPF_CGROUP_STORAGE_SHARED) { + storage->buf = kmalloc_node(size, flags, map->numa_node); + if (!storage->buf) + goto enomem; + } else { + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + if (!storage->percpu_buf) + goto enomem; } storage->map = (struct bpf_cgroup_storage_map *)map; return storage; + +enomem: + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); +} + +static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + kfree(storage->buf); + kfree(storage); +} + +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + free_percpu(storage->percpu_buf); + kfree(storage); } void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { - u32 pages; + enum bpf_cgroup_storage_type stype; struct bpf_map *map; + u32 pages; if (!storage) return; map = &storage->map->map; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + + bpf_cgroup_storage_calculate_size(map, &pages); bpf_map_uncharge_memlock(map, pages); - kfree_rcu(storage->buf, rcu); - kfree_rcu(storage, rcu); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) + call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); + else + call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); } void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8c91d2b41b1e..5742df21598c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) value_size = sizeof(u32); @@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map)) { @@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else value_size = map->value_size; @@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + attr->flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e90899df585d..a8cc83a970d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: if (func_id != BPF_FUNC_get_local_storage) goto error; break; @@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_get_local_storage: - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) goto error; break; case BPF_FUNC_sk_select_reuseport: @@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return 0; } +static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +{ + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; - if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + if (bpf_map_is_cgroup_storage(map) && bpf_cgroup_storage_assign(env->prog, map)) { - verbose(env, - "only one cgroup storage is allowed\n"); + verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; } -- cgit v1.2.3 From c6fdcd6e0cc4dc316e3eb261025fb0abd69540b9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:46 +0000 Subject: bpf: don't allow create maps of per-cpu cgroup local storages Explicitly forbid creating map of per-cpu cgroup local storages. This behavior matches the behavior of shared cgroup storages. Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/map_in_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3bfbf4464416..99d243e1ad6e 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -24,7 +24,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) * in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { + inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } -- cgit v1.2.3 From f92b070f2dc89a8ff1a0cc8b608e20abef894c7d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 13 Sep 2018 14:34:06 +0200 Subject: printk: Do not miss new messages when replaying the log The variable "exclusive_console" is used to reply all existing messages on a newly registered console. It is cleared when all messages are out. The problem is that new messages might appear in the meantime. These are then visible only on the exclusive console. The obvious solution is to clear "exclusive_console" after we replay all messages that were already proceed before we started the reply. Reported-by: Sergey Senozhatsky Link: http://lkml.kernel.org/r/20180913123406.14378-1-pmladek@suse.com To: Steven Rostedt Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9bf5404397e0..cfaa211a8b54 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -423,6 +423,7 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; +static u64 exclusive_console_stop_seq; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -2009,6 +2010,7 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; +static u64 exclusive_console_stop_seq; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; @@ -2376,6 +2378,12 @@ skip: goto skip; } + /* Output to all consoles once old messages replayed. */ + if (unlikely(exclusive_console && + console_seq >= exclusive_console_stop_seq)) { + exclusive_console = NULL; + } + len += msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG, text + len, @@ -2418,10 +2426,6 @@ skip: console_locked = 0; - /* Release the exclusive_console once it is used */ - if (unlikely(exclusive_console)) - exclusive_console = NULL; - raw_spin_unlock(&logbuf_lock); up_console_sem(); @@ -2706,6 +2710,7 @@ void register_console(struct console *newcon) * the already-registered consoles. */ exclusive_console = newcon; + exclusive_console_stop_seq = console_seq; } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3 From 884e370ea88c109a3b982f4eb9ecd82510a3a1fe Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 28 Sep 2018 18:53:04 +0900 Subject: printk: CON_PRINTBUFFER console registration is a bit racy CON_PRINTBUFFER console registration requires us to do several preparation steps: - Rollback console_seq to replay logbuf messages which were already seen on other consoles; - Set exclusive_console flag so console_unlock() will ->write() logbuf messages only to the exclusive_console driver. The way we do it, however, is a bit racy logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; logbuf_unlock_irqrestore(flags); << preemption enabled << irqs enabled exclusive_console = newcon; console_unlock(); We rollback console_seq under logbuf_lock with IRQs disabled, but we set exclusive_console with local IRQs enabled and logbuf unlocked. If the system oops-es or panic-s before we set exclusive_console - and given that we have IRQs and preemption enabled there is such a possibility - we will re-play all logbuf messages to every registered console, which may be a bit annoying and time consuming. Move exclusive_console assignment to the same IRQs-disabled and logbuf_lock-protected section where we rollback console_seq. Link: http://lkml.kernel.org/r/20180928095304.9972-1-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: Sergey Senozhatsky Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index cfaa211a8b54..c5b568c2d167 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2703,14 +2703,18 @@ void register_console(struct console *newcon) logbuf_lock_irqsave(flags); console_seq = syslog_seq; console_idx = syslog_idx; - logbuf_unlock_irqrestore(flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to * the already-registered consoles. + * + * Set exclusive_console with disabled interrupts to reduce + * race window with eventual console_flush_on_panic() that + * ignores console_lock. */ exclusive_console = newcon; exclusive_console_stop_seq = console_seq; + logbuf_unlock_irqrestore(flags); } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3 From f3709f69b7c5cba6323cc03c29b64293b93be817 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:29 -0700 Subject: bpf: Add iterator for spilled registers Add this iterator for spilled registers, it concentrates the details of how to get the current frame's spilled registers into a single macro while clarifying the intention of the code which is calling the macro. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8cc83a970d1..9c82d8f58085 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2252,10 +2252,9 @@ static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -3395,10 +3394,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; - reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } @@ -3643,7 +3641,7 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg, *regs = state->regs; u32 id = regs[regno].id; int i, j; @@ -3652,8 +3650,8 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) continue; mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } -- cgit v1.2.3 From aad2eeaf46973a0968a75640cd1f8f1c650322a0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:30 -0700 Subject: bpf: Simplify ptr_min_max_vals adjustment An upcoming commit will add another two pointer types that need very similar behaviour, so generalise this function now. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c82d8f58085..abf567200574 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2669,20 +2669,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); - return -EACCES; - } - if (ptr_reg->type == CONST_PTR_TO_MAP) { - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + switch (ptr_reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: + verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; - } - if (ptr_reg->type == PTR_TO_PACKET_END) { - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + case CONST_PTR_TO_MAP: + case PTR_TO_PACKET_END: + verbose(env, "R%d pointer arithmetic on %s prohibited\n", + dst, reg_type_str[ptr_reg->type]); return -EACCES; + default: + break; } /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. -- cgit v1.2.3 From 9d2be44a7f33d5ec4fbd3368317bcf5f404bb8f7 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:31 -0700 Subject: bpf: Reuse canonical string formatter for ctx errs The array "reg_type_str" provides canonical formatting of register types, however a couple of places would previously check whether a register represented the context and write the name "context" directly. An upcoming commit will add another pointer type to these statements, so to provide more accurate error messages in the verifier, update these error messages to use "reg_type_str" instead. Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index abf567200574..8b4e70eeced2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1763,8 +1763,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", - insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? - "context" : "packet"); + insn->dst_reg, reg_type_str[insn->dst_reg]); return -EACCES; } @@ -4871,8 +4870,8 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_ST stores into R%d context is not allowed\n", - insn->dst_reg); + verbose(env, "BPF_ST stores into R%d %s is not allowed\n", + insn->dst_reg, reg_type_str[insn->dst_reg]); return -EACCES; } -- cgit v1.2.3 From 840b9615d6e9d134178b4dd4f3c30aa30643a379 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:32 -0700 Subject: bpf: Generalize ptr_or_null regs check This check will be reused by an upcoming commit for conditional jump checks for sockets. Refactor it a bit to simplify the later commit. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8b4e70eeced2..98b218bd46e8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -249,6 +249,11 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool reg_type_may_be_null(enum bpf_reg_type type) +{ + return type == PTR_TO_MAP_VALUE_OR_NULL; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -3599,12 +3604,10 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, - bool is_null) +static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, + bool is_null) { - struct bpf_reg_state *reg = ®s[regno]; - - if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + if (reg_type_may_be_null(reg->type) && reg->id == id) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer * arithmetic on pointers that might be NULL. @@ -3617,11 +3620,13 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, } if (is_null) { reg->type = SCALAR_VALUE; - } else if (reg->map_ptr->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else { - reg->type = PTR_TO_MAP_VALUE; + } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { + if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = PTR_TO_MAP_VALUE; + } } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -3634,8 +3639,8 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, - bool is_null) +static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, + bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg, *regs = state->regs; @@ -3643,14 +3648,14 @@ static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, id, is_null); + mark_ptr_or_null_reg(®s[i], id, is_null); for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + mark_ptr_or_null_reg(reg, id, is_null); } } } @@ -3852,12 +3857,14 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - /* Mark all identical map registers in each branch as either + reg_type_may_be_null(dst_reg->type)) { + /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ - mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); - mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); + mark_ptr_or_null_regs(this_branch, insn->dst_reg, + opcode == BPF_JNE); + mark_ptr_or_null_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ); } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], this_branch, other_branch) && is_pointer_value(env, insn->dst_reg)) { -- cgit v1.2.3 From c64b7983288e636356f7f5f652de4813e1cfedac Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:33 -0700 Subject: bpf: Add PTR_TO_SOCKET verifier type Teach the verifier a little bit about a new type of pointer, a PTR_TO_SOCKET. This pointer type is accessed from BPF through the 'struct bpf_sock' structure. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 120 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 106 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98b218bd46e8..f86386c9affd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -80,8 +80,8 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * (like pointer plus pointer becomes SCALAR_VALUE type) * * When verifier sees load or store instructions the type of base register - * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK. These are three pointer - * types recognized by check_mem_access() function. + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are + * four pointer types recognized by check_mem_access() function. * * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' * and the range of [ptr, ptr + map's value_size) is accessible. @@ -267,6 +267,8 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", }; static char slot_type_char[] = { @@ -973,6 +975,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: return true; default: return false; @@ -1341,6 +1345,28 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } +static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, + int size, enum bpf_access_type t) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[regno]; + struct bpf_insn_access_aux info; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + regno); + return -EACCES; + } + + if (!bpf_sock_is_valid_access(off, size, t, &info)) { + verbose(env, "invalid bpf_sock access off=%d size=%d\n", + off, size); + return -EACCES; + } + + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -1459,6 +1485,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, */ strict = true; break; + case PTR_TO_SOCKET: + pointer_desc = "sock "; + break; default: break; } @@ -1726,6 +1755,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_SOCKET) { + if (t == BPF_WRITE) { + verbose(env, "cannot write into socket\n"); + return -EACCES; + } + err = check_sock_access(env, regno, off, size, t); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1948,6 +1985,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCKET) { + expected_type = PTR_TO_SOCKET; + if (type != expected_type) + goto err_type; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2543,6 +2584,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -2680,6 +2725,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -3627,6 +3674,8 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, } else { reg->type = PTR_TO_MAP_VALUE; } + } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { + reg->type = PTR_TO_SOCKET; } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances @@ -4402,6 +4451,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -4679,6 +4730,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } +/* Return true if it's OK to have the same insn return a different type. */ +static bool reg_type_mismatch_ok(enum bpf_reg_type type) +{ + switch (type) { + case PTR_TO_CTX: + case PTR_TO_SOCKET: + case PTR_TO_SOCKET_OR_NULL: + return false; + default: + return true; + } +} + +/* If an instruction was previously used with particular pointer types, then we + * need to be careful to avoid cases such as the below, where it may be ok + * for one branch accessing the pointer, but not ok for the other branch: + * + * R1 = sock_ptr + * goto X; + * ... + * R1 = some_other_valid_ptr; + * goto X; + * ... + * R2 = *(u32 *)(R1 + 0); + */ +static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) +{ + return src != prev && (!reg_type_mismatch_ok(src) || + !reg_type_mismatch_ok(prev)); +} + static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -4811,9 +4893,7 @@ static int do_check(struct bpf_verifier_env *env) */ *prev_src_type = src_reg_type; - } else if (src_reg_type != *prev_src_type && - (src_reg_type == PTR_TO_CTX || - *prev_src_type == PTR_TO_CTX)) { + } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -4858,9 +4938,7 @@ static int do_check(struct bpf_verifier_env *env) if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; - } else if (dst_reg_type != *prev_dst_type && - (dst_reg_type == PTR_TO_CTX || - *prev_dst_type == PTR_TO_CTX)) { + } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -5286,8 +5364,10 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } -/* convert load instructions that access fields of 'struct __sk_buff' - * into sequence of instructions that access fields of 'struct sk_buff' +/* convert load instructions that access fields of a context type into a + * sequence of instructions that access fields of the underlying structure: + * struct __sk_buff -> struct sk_buff + * struct bpf_sock_ops -> struct sock */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { @@ -5316,12 +5396,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) + if (bpf_prog_is_dev_bound(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; for (i = 0; i < insn_cnt; i++, insn++) { + bpf_convert_ctx_access_t convert_ctx_access; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -5363,8 +5445,18 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } - if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) + switch (env->insn_aux_data[i + delta].ptr_type) { + case PTR_TO_CTX: + if (!ops->convert_ctx_access) + continue; + convert_ctx_access = ops->convert_ctx_access; + break; + case PTR_TO_SOCKET: + convert_ctx_access = bpf_sock_convert_ctx_access; + break; + default: continue; + } ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; size = BPF_LDST_BYTES(insn); @@ -5396,8 +5488,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } target_size = 0; - cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, - &target_size); + cnt = convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); -- cgit v1.2.3 From 84dbf3507349696b505b6a500722538b0683e4ac Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:34 -0700 Subject: bpf: Macrofy stack state copy An upcoming commit will need very similar copy/realloc boilerplate, so refactor the existing stack copy/realloc functions into macros to simplify it. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 106 ++++++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f86386c9affd..11e982381061 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -388,60 +388,74 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static int copy_stack_state(struct bpf_func_state *dst, - const struct bpf_func_state *src) -{ - if (!src->stack) - return 0; - if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { - /* internal bug, make state invalid to reject the program */ - memset(dst, 0, sizeof(*dst)); - return -EFAULT; - } - memcpy(dst->stack, src->stack, - sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); - return 0; -} +#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int copy_##NAME##_state(struct bpf_func_state *dst, \ + const struct bpf_func_state *src) \ +{ \ + if (!src->FIELD) \ + return 0; \ + if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \ + /* internal bug, make state invalid to reject the program */ \ + memset(dst, 0, sizeof(*dst)); \ + return -EFAULT; \ + } \ + memcpy(dst->FIELD, src->FIELD, \ + sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ + return 0; \ +} +/* copy_stack_state() */ +COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef COPY_STATE_FN + +#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ + bool copy_old) \ +{ \ + u32 old_size = state->COUNT; \ + struct bpf_##NAME##_state *new_##FIELD; \ + int slot = size / SIZE; \ + \ + if (size <= old_size || !size) { \ + if (copy_old) \ + return 0; \ + state->COUNT = slot * SIZE; \ + if (!size && old_size) { \ + kfree(state->FIELD); \ + state->FIELD = NULL; \ + } \ + return 0; \ + } \ + new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ + GFP_KERNEL); \ + if (!new_##FIELD) \ + return -ENOMEM; \ + if (copy_old) { \ + if (state->FIELD) \ + memcpy(new_##FIELD, state->FIELD, \ + sizeof(*new_##FIELD) * (old_size / SIZE)); \ + memset(new_##FIELD + old_size / SIZE, 0, \ + sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ + } \ + state->COUNT = slot * SIZE; \ + kfree(state->FIELD); \ + state->FIELD = new_##FIELD; \ + return 0; \ +} +/* realloc_stack_state() */ +REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +#undef REALLOC_STATE_FN /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from * the program calls into realloc_func_state() to grow the stack size. - * Note there is a non-zero parent pointer inside each reg of bpf_verifier_state - * which this function copies over. It points to corresponding reg in previous - * bpf_verifier_state which is never reallocated + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which realloc_stack_state() copies over. It points to previous + * bpf_verifier_state which is never reallocated. */ static int realloc_func_state(struct bpf_func_state *state, int size, bool copy_old) { - u32 old_size = state->allocated_stack; - struct bpf_stack_state *new_stack; - int slot = size / BPF_REG_SIZE; - - if (size <= old_size || !size) { - if (copy_old) - return 0; - state->allocated_stack = slot * BPF_REG_SIZE; - if (!size && old_size) { - kfree(state->stack); - state->stack = NULL; - } - return 0; - } - new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), - GFP_KERNEL); - if (!new_stack) - return -ENOMEM; - if (copy_old) { - if (state->stack) - memcpy(new_stack, state->stack, - sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); - memset(new_stack + old_size / BPF_REG_SIZE, 0, - sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); - } - state->allocated_stack = slot * BPF_REG_SIZE; - kfree(state->stack); - state->stack = new_stack; - return 0; + return realloc_stack_state(state, size, copy_old); } static void free_func_state(struct bpf_func_state *state) -- cgit v1.2.3 From fd978bf7fd312581a7ca454a991f0ffb34c4204b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:35 -0700 Subject: bpf: Add reference tracking to verifier Allow helper functions to acquire a reference and return it into a register. Specific pointer types such as the PTR_TO_SOCKET will implicitly represent such a reference. The verifier must ensure that these references are released exactly once in each path through the program. To achieve this, this commit assigns an id to the pointer and tracks it in the 'bpf_func_state', then when the function or program exits, verifies that all of the acquired references have been freed. When the pointer is passed to a function that frees the reference, it is removed from the 'bpf_func_state` and all existing copies of the pointer in registers are marked invalid. Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 306 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 287 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11e982381061..cd0d8bc00bd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,5 +1,6 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -140,6 +141,18 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * * After the call R0 is set to return type of the function and registers R1-R5 * are set to NOT_INIT to indicate that they are no longer readable. + * + * The following reference types represent a potential reference to a kernel + * resource which, after first being allocated, must be checked and freed by + * the BPF program: + * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET + * + * When the verifier sees a helper call return a reference type, it allocates a + * pointer id for the reference and stores it in the current function state. + * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into + * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type + * passes through a NULL-check conditional. For the branch wherein the state is + * changed to CONST_IMM, the verifier releases the reference. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -189,6 +202,7 @@ struct bpf_call_arg_meta { int access_size; s64 msize_smax_value; u64 msize_umax_value; + int ptr_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -251,7 +265,42 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool reg_type_may_be_null(enum bpf_reg_type type) { - return type == PTR_TO_MAP_VALUE_OR_NULL; + return type == PTR_TO_MAP_VALUE_OR_NULL || + type == PTR_TO_SOCKET_OR_NULL; +} + +static bool type_is_refcounted(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET; +} + +static bool type_is_refcounted_or_null(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; +} + +static bool reg_is_refcounted(const struct bpf_reg_state *reg) +{ + return type_is_refcounted(reg->type); +} + +static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +{ + return type_is_refcounted_or_null(reg->type); +} + +static bool arg_type_is_refcounted(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_SOCKET; +} + +/* Determine whether the function releases some resources allocated by another + * function call. The first reference type argument will be assumed to be + * released by release_reference(). + */ +static bool is_release_function(enum bpf_func_id func_id) +{ + return false; } /* string representation of 'enum bpf_reg_type' */ @@ -385,6 +434,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, else verbose(env, "=%s", types_buf); } + if (state->acquired_refs && state->refs[0].id) { + verbose(env, " refs=%d", state->refs[0].id); + for (i = 1; i < state->acquired_refs; i++) + if (state->refs[i].id) + verbose(env, ",%d", state->refs[i].id); + } verbose(env, "\n"); } @@ -403,6 +458,8 @@ static int copy_##NAME##_state(struct bpf_func_state *dst, \ sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ return 0; \ } +/* copy_reference_state() */ +COPY_STATE_FN(reference, acquired_refs, refs, 1) /* copy_stack_state() */ COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) #undef COPY_STATE_FN @@ -441,6 +498,8 @@ static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ state->FIELD = new_##FIELD; \ return 0; \ } +/* realloc_reference_state() */ +REALLOC_STATE_FN(reference, acquired_refs, refs, 1) /* realloc_stack_state() */ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) #undef REALLOC_STATE_FN @@ -452,16 +511,89 @@ REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) * which realloc_stack_state() copies over. It points to previous * bpf_verifier_state which is never reallocated. */ -static int realloc_func_state(struct bpf_func_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int stack_size, + int refs_size, bool copy_old) { - return realloc_stack_state(state, size, copy_old); + int err = realloc_reference_state(state, refs_size, copy_old); + if (err) + return err; + return realloc_stack_state(state, stack_size, copy_old); +} + +/* Acquire a pointer id from the env and update the state->refs to include + * this new pointer reference. + * On success, returns a valid pointer id to associate with the register + * On failure, returns a negative errno. + */ +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) +{ + struct bpf_func_state *state = cur_func(env); + int new_ofs = state->acquired_refs; + int id, err; + + err = realloc_reference_state(state, state->acquired_refs + 1, true); + if (err) + return err; + id = ++env->id_gen; + state->refs[new_ofs].id = id; + state->refs[new_ofs].insn_idx = insn_idx; + + return id; +} + +/* release function corresponding to acquire_reference_state(). Idempotent. */ +static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +{ + int i, last_idx; + + if (!ptr_id) + return -EFAULT; + + last_idx = state->acquired_refs - 1; + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].id == ptr_id) { + if (last_idx && i != last_idx) + memcpy(&state->refs[i], &state->refs[last_idx], + sizeof(*state->refs)); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; + return 0; + } + } + return -EFAULT; +} + +/* variation on the above for cases where we expect that there must be an + * outstanding reference for the specified ptr_id. + */ +static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) +{ + struct bpf_func_state *state = cur_func(env); + int err; + + err = __release_reference_state(state, ptr_id); + if (WARN_ON_ONCE(err != 0)) + verbose(env, "verifier internal error: can't release reference\n"); + return err; +} + +static int transfer_reference_state(struct bpf_func_state *dst, + struct bpf_func_state *src) +{ + int err = realloc_reference_state(dst, src->acquired_refs, false); + if (err) + return err; + err = copy_reference_state(dst, src); + if (err) + return err; + return 0; } static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->refs); kfree(state->stack); kfree(state); } @@ -487,10 +619,14 @@ static int copy_func_state(struct bpf_func_state *dst, { int err; - err = realloc_func_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, + false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); + err = copy_reference_state(dst, src); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } @@ -1015,7 +1151,7 @@ static int check_stack_write(struct bpf_verifier_env *env, enum bpf_reg_type type; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + state->acquired_refs, true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -1399,7 +1535,8 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = cur_regs(env) + regno; - return reg->type == PTR_TO_CTX; + return reg->type == PTR_TO_CTX || + reg->type == PTR_TO_SOCKET; } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -2003,6 +2140,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; + if (meta->ptr_id || !reg->id) { + verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", + meta->ptr_id, reg->id); + return -EFAULT; + } + meta->ptr_id = reg->id; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2292,10 +2435,32 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } +static bool check_refcount_ok(const struct bpf_func_proto *fn) +{ + int count = 0; + + if (arg_type_is_refcounted(fn->arg1_type)) + count++; + if (arg_type_is_refcounted(fn->arg2_type)) + count++; + if (arg_type_is_refcounted(fn->arg3_type)) + count++; + if (arg_type_is_refcounted(fn->arg4_type)) + count++; + if (arg_type_is_refcounted(fn->arg5_type)) + count++; + + /* We only support one arg being unreferenced at the moment, + * which is sufficient for the helper functions we have right now. + */ + return count <= 1; +} + static int check_func_proto(const struct bpf_func_proto *fn) { return check_raw_mode_ok(fn) && - check_arg_pair_ok(fn) ? 0 : -EINVAL; + check_arg_pair_ok(fn) && + check_refcount_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2328,12 +2493,45 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +static void release_reg_references(struct bpf_verifier_env *env, + struct bpf_func_state *state, int id) +{ + struct bpf_reg_state *regs = state->regs, *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + if (regs[i].id == id) + mark_reg_unknown(env, regs, i); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + if (reg_is_refcounted(reg) && reg->id == id) + __mark_reg_unknown(reg); + } +} + +/* The pointer with the specified id has released its reference to kernel + * resources. Identify all copies of the same pointer and clear the reference. + */ +static int release_reference(struct bpf_verifier_env *env, + struct bpf_call_arg_meta *meta) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + release_reg_references(env, vstate->frame[i], meta->ptr_id); + + return release_reference_state(env, meta->ptr_id); +} + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; - int i, subprog, target_insn; + int i, err, subprog, target_insn; if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", @@ -2371,6 +2569,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); + /* Transfer references to the callee */ + err = transfer_reference_state(callee, caller); + if (err) + return err; + /* copy r1 - r5 args that callee can access. The copy includes parent * pointers, which connects us up to the liveness chain */ @@ -2403,6 +2606,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *caller, *callee; struct bpf_reg_state *r0; + int err; callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; @@ -2422,6 +2626,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) /* return to the caller whatever r0 had in the callee */ caller->regs[BPF_REG_0] = *r0; + /* Transfer references to the caller */ + err = transfer_reference_state(caller, callee); + if (err) + return err; + *insn_idx = callee->callsite + 1; if (env->log.level) { verbose(env, "returning from callee:\n"); @@ -2478,6 +2687,18 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } +static int check_reference_leak(struct bpf_verifier_env *env) +{ + struct bpf_func_state *state = cur_func(env); + int i; + + for (i = 0; i < state->acquired_refs; i++) { + verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", + state->refs[i].id, state->refs[i].insn_idx); + } + return state->acquired_refs ? -EINVAL : 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2556,6 +2777,18 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } + if (func_id == BPF_FUNC_tail_call) { + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + } else if (is_release_function(func_id)) { + err = release_reference(env, &meta); + if (err) + return err; + } + regs = cur_regs(env); /* check that flags argument in get_local_storage(map, flags) is 0, @@ -2599,9 +2832,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + int id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].id = id; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3665,7 +3901,8 @@ static void reg_combine_min_max(struct bpf_reg_state *true_src, } } -static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, +static void mark_ptr_or_null_reg(struct bpf_func_state *state, + struct bpf_reg_state *reg, u32 id, bool is_null) { if (reg_type_may_be_null(reg->type) && reg->id == id) { @@ -3691,11 +3928,13 @@ static void mark_ptr_or_null_reg(struct bpf_reg_state *reg, u32 id, } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; } - /* We don't need id from this point onwards anymore, thus we - * should better reset it, so that state pruning has chances - * to take effect. - */ - reg->id = 0; + if (is_null || !reg_is_refcounted(reg)) { + /* We don't need id from this point onwards anymore, + * thus we should better reset it, so that state + * pruning has chances to take effect. + */ + reg->id = 0; + } } } @@ -3710,15 +3949,18 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, u32 id = regs[regno].id; int i, j; + if (reg_is_refcounted_or_null(®s[regno]) && is_null) + __release_reference_state(state, id); + for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(®s[i], id, is_null); + mark_ptr_or_null_reg(state, ®s[i], id, is_null); for (j = 0; j <= vstate->curframe; j++) { state = vstate->frame[j]; bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - mark_ptr_or_null_reg(reg, id, is_null); + mark_ptr_or_null_reg(state, reg, id, is_null); } } } @@ -4050,6 +4292,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as + * gen_ld_abs() may terminate the program at runtime, leading to + * reference leak. + */ + err = check_reference_leak(env); + if (err) { + verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); + return err; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4542,6 +4794,14 @@ static bool stacksafe(struct bpf_func_state *old, return true; } +static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) +{ + if (old->acquired_refs != cur->acquired_refs) + return false; + return !memcmp(old->refs, cur->refs, + sizeof(*old->refs) * old->acquired_refs); +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -4587,6 +4847,9 @@ static bool func_states_equal(struct bpf_func_state *old, if (!stacksafe(old, cur, idmap)) goto out_free; + + if (!refsafe(old, cur)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -4868,6 +5131,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; + if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -5032,6 +5296,10 @@ static int do_check(struct bpf_verifier_env *env) continue; } + err = check_reference_leak(env); + if (err) + return err; + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time -- cgit v1.2.3 From 6acc9b432e6714d72d7d77ec7c27f6f8358d0c71 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:36 -0700 Subject: bpf: Add helper to retrieve socket in BPF This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and bpf_sk_lookup_udp() which allows BPF programs to find out if there is a socket listening on this host, and returns a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added sk_release() to return the reference. By way of example, the following pseudocode would filter inbound connections at XDP if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock_ops *sk; populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk, 0); return TC_ACT_OK; Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd0d8bc00bd1..73c81bef6ae8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as + * bpf_sk_release(). When a reference type passes into the release function, + * the verifier also releases the reference. If any unchecked or unreleased + * reference remains at the end of the program, the verifier rejects it. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return false; + return func_id == BPF_FUNC_sk_release; } /* string representation of 'enum bpf_reg_type' */ -- cgit v1.2.3 From 018303a931a89b91dacd76140b8ebe51893dc5fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Apr 2018 19:15:59 -0500 Subject: signal/sparc: Move EMT_TAGOVF into the generic siginfo.h When moving all of the architectures specific si_codes into siginfo.h, I apparently overlooked EMT_TAGOVF. Move it now. Remove the now redundant test in siginfo_layout for SIGEMT as now NSIGEMT is always defined. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e16278710b36..7b49c31d3fdb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2856,7 +2856,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, [SIGBUS] = { NSIGBUS, SIL_FAULT }, [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, -#if defined(SIGEMT) && defined(NSIGEMT) +#if defined(SIGEMT) [SIGEMT] = { NSIGEMT, SIL_FAULT }, #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, -- cgit v1.2.3 From e75dc036c445b91b8b2ad4e6c9b05f04b6be6d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 25 Sep 2018 12:04:47 +0200 Subject: signal: Fail sigqueueinfo if si_signo != sig The kernel needs to validate that the contents of struct siginfo make sense as siginfo is copied into the kernel, so that the proper union members can be put in the appropriate locations. The field si_signo is a fundamental part of that validation. As such changing the contents of si_signo after the validation make no sense and can result in nonsense values in the kernel. As such simply fail if someone is silly enough to set si_signo out of sync with the signal number passed to sigqueueinfo. I don't expect a problem as glibc's sigqueue implementation sets "si_signo = sig" and CRIU just returns to the kernel what the kernel gave to it. If there is some application that calls sigqueueinfo directly that has a problem with this added sanity check we can revisit this when we see what kind of crazy that application is doing. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7b49c31d3fdb..e445b0a63faa 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3306,7 +3306,8 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) (task_pid_vnr(current) != pid)) return -EPERM; - info->si_signo = sig; + if (info->si_signo != sig) + return -EINVAL; /* POSIX.1b doesn't mention process groups. */ return kill_proc_info(sig, info, pid); @@ -3354,7 +3355,8 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) (task_pid_vnr(current) != pid)) return -EPERM; - info->si_signo = sig; + if (info->si_signo != sig) + return -EINVAL; return do_send_specific(tgid, pid, sig, info); } -- cgit v1.2.3 From f28380185193610c716a90ec9b9e696638a495ce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Apr 2018 17:48:49 -0500 Subject: signal: Remove the need for __ARCH_SI_PREABLE_SIZE and SI_PAD_SIZE Rework the defintion of struct siginfo so that the array padding struct siginfo to SI_MAX_SIZE can be placed in a union along side of the rest of the struct siginfo members. The result is that we no longer need the __ARCH_SI_PREAMBLE_SIZE or SI_PAD_SIZE definitions. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e445b0a63faa..debb485a76db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3963,9 +3963,6 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { - /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ - BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE - != offsetof(struct siginfo, _sifields._pad)); BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); -- cgit v1.2.3 From 4cd2e0e70af6897ca2247fa1ffb1553ca16b4903 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 18 Apr 2018 17:30:19 -0500 Subject: signal: Introduce copy_siginfo_from_user and use it's return value In preparation for using a smaller version of siginfo in the kernel introduce copy_siginfo_from_user and use it when siginfo is copied from userspace. Make the pattern for using copy_siginfo_from_user and copy_siginfo_from_user32 to capture the return value and return that value on error. This is a necessary prerequisite for using a smaller siginfo in the kernel than the kernel exports to userspace. Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 12 +++++------- kernel/signal.c | 25 ++++++++++++++++--------- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 45f77a1b9c97..a807ff5cc1a9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -919,9 +919,8 @@ int ptrace_request(struct task_struct *child, long request, break; case PTRACE_SETSIGINFO: - if (copy_from_user(&siginfo, datavp, sizeof siginfo)) - ret = -EFAULT; - else + ret = copy_siginfo_from_user(&siginfo, datavp); + if (!ret) ret = ptrace_setsiginfo(child, &siginfo); break; @@ -1215,10 +1214,9 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, break; case PTRACE_SETSIGINFO: - if (copy_siginfo_from_user32( - &siginfo, (struct compat_siginfo __user *) datap)) - ret = -EFAULT; - else + ret = copy_siginfo_from_user32( + &siginfo, (struct compat_siginfo __user *) datap); + if (!ret) ret = ptrace_setsiginfo(child, &siginfo); break; #ifdef CONFIG_HAVE_ARCH_TRACEHOOK diff --git a/kernel/signal.c b/kernel/signal.c index debb485a76db..c0e289e62d77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2896,6 +2896,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) return 0; } +int copy_siginfo_from_user(siginfo_t *to, const siginfo_t __user *from) +{ + if (copy_from_user(to, from, sizeof(struct siginfo))) + return -EFAULT; + return 0; +} + #ifdef CONFIG_COMPAT int copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct siginfo *from) @@ -3323,8 +3330,9 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { siginfo_t info; - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) - return -EFAULT; + int ret = copy_siginfo_from_user(&info, uinfo); + if (unlikely(ret)) + return ret; return do_rt_sigqueueinfo(pid, sig, &info); } @@ -3365,10 +3373,9 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { siginfo_t info; - - if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) - return -EFAULT; - + int ret = copy_siginfo_from_user(&info, uinfo); + if (unlikely(ret)) + return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } @@ -3380,9 +3387,9 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, struct compat_siginfo __user *, uinfo) { siginfo_t info; - - if (copy_siginfo_from_user32(&info, uinfo)) - return -EFAULT; + int ret = copy_siginfo_from_user32(&info, uinfo); + if (unlikely(ret)) + return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #endif -- cgit v1.2.3 From ae7795bc6187a15ec51cf258abae656a625f9980 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 25 Sep 2018 11:27:20 +0200 Subject: signal: Distinguish between kernel_siginfo and siginfo Linus recently observed that if we did not worry about the padding member in struct siginfo it is only about 48 bytes, and 48 bytes is much nicer than 128 bytes for allocating on the stack and copying around in the kernel. The obvious thing of only adding the padding when userspace is including siginfo.h won't work as there are sigframe definitions in the kernel that embed struct siginfo. So split siginfo in two; kernel_siginfo and siginfo. Keeping the traditional name for the userspace definition. While the version that is used internally to the kernel and ultimately will not be padded to 128 bytes is called kernel_siginfo. The definition of struct kernel_siginfo I have put in include/signal_types.h A set of buildtime checks has been added to verify the two structures have the same field offsets. To make it easy to verify the change kernel_siginfo retains the same size as siginfo. The reduction in size comes in a following change. Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 10 +-- kernel/seccomp.c | 6 +- kernel/signal.c | 151 +++++++++++++++++++++++++++++---------------- kernel/time/posix-timers.c | 2 +- 4 files changed, 108 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a807ff5cc1a9..c2cee9db5204 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -651,7 +651,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) return 0; } -static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) +static int ptrace_getsiginfo(struct task_struct *child, kernel_siginfo_t *info) { unsigned long flags; int error = -ESRCH; @@ -667,7 +667,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) return error; } -static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) +static int ptrace_setsiginfo(struct task_struct *child, const kernel_siginfo_t *info) { unsigned long flags; int error = -ESRCH; @@ -709,7 +709,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, pending = &child->pending; for (i = 0; i < arg.nr; ) { - siginfo_t info; + kernel_siginfo_t info; s32 off = arg.off + i; spin_lock_irq(&child->sighand->siglock); @@ -885,7 +885,7 @@ int ptrace_request(struct task_struct *child, long request, { bool seized = child->ptrace & PT_SEIZED; int ret = -EIO; - siginfo_t siginfo, *si; + kernel_siginfo_t siginfo, *si; void __user *datavp = (void __user *) data; unsigned long __user *datalp = datavp; unsigned long flags; @@ -1180,7 +1180,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, { compat_ulong_t __user *datap = compat_ptr(data); compat_ulong_t word; - siginfo_t siginfo; + kernel_siginfo_t siginfo; int ret; switch (request) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fd023ac24e10..4d7809cdd27d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -522,7 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk) __put_seccomp_filter(tsk->seccomp.filter); } -static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) +static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) { clear_siginfo(info); info->si_signo = SIGSYS; @@ -542,7 +542,7 @@ static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) */ static void seccomp_send_sigsys(int syscall, int reason) { - struct siginfo info; + struct kernel_siginfo info; seccomp_init_siginfo(&info, syscall, reason); force_sig_info(SIGSYS, &info, current); } @@ -747,7 +747,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* Dump core only if this is the last remaining thread. */ if (action == SECCOMP_RET_KILL_PROCESS || get_nr_threads(current) == 1) { - siginfo_t info; + kernel_siginfo_t info; /* Show the original registers in the dump. */ syscall_rollback(current, task_pt_regs(current)); diff --git a/kernel/signal.c b/kernel/signal.c index c0e289e62d77..161cad4e448c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -549,7 +549,7 @@ bool unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, +static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info, bool *resched_timer) { struct sigqueue *q, *first = NULL; @@ -595,7 +595,7 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info, bool *resched_timer) + kernel_siginfo_t *info, bool *resched_timer) { int sig = next_signal(pending, mask); @@ -610,7 +610,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, * * All callers have to hold the siglock. */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info) { bool resched_timer = false; int signr; @@ -737,12 +737,12 @@ static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) } } -static inline int is_si_special(const struct siginfo *info) +static inline int is_si_special(const struct kernel_siginfo *info) { return info <= SEND_SIG_PRIV; } -static inline bool si_fromuser(const struct siginfo *info) +static inline bool si_fromuser(const struct kernel_siginfo *info) { return info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)); @@ -767,7 +767,7 @@ static bool kill_ok_by_cred(struct task_struct *t) * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ -static int check_kill_permission(int sig, struct siginfo *info, +static int check_kill_permission(int sig, struct kernel_siginfo *info, struct task_struct *t) { struct pid *sid; @@ -1010,7 +1010,7 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) } #ifdef CONFIG_USER_NS -static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) { if (current_user_ns() == task_cred_xxx(t, user_ns)) return; @@ -1024,13 +1024,13 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str rcu_read_unlock(); } #else -static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) { return; } #endif -static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, +static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, int from_ancestor_ns) { struct sigpending *pending; @@ -1150,7 +1150,7 @@ ret: return ret; } -static int send_signal(int sig, struct siginfo *info, struct task_struct *t, +static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { int from_ancestor_ns = 0; @@ -1197,12 +1197,12 @@ static int __init setup_print_fatal_signals(char *str) __setup("print-fatal-signals=", setup_print_fatal_signals); int -__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +__group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { return send_signal(sig, info, p, PIDTYPE_TGID); } -int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, +int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { unsigned long flags; @@ -1228,7 +1228,7 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, * that is why we also clear SIGNAL_UNKILLABLE. */ int -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) +force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; @@ -1316,8 +1316,8 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, /* * send signal info to all the members of a group */ -int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, - enum pid_type type) +int group_send_sig_info(int sig, struct kernel_siginfo *info, + struct task_struct *p, enum pid_type type) { int ret; @@ -1336,7 +1336,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, * control characters do (^C, ^Z etc) * - the caller must hold at least a readlock on tasklist_lock */ -int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) +int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int retval, success; @@ -1351,7 +1351,7 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) return success ? 0 : retval; } -int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) { int error = -ESRCH; struct task_struct *p; @@ -1373,7 +1373,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) } } -static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) +static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1394,7 +1394,7 @@ static inline bool kill_as_cred_perm(const struct cred *cred, } /* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, +int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, const struct cred *cred) { int ret = -EINVAL; @@ -1438,7 +1438,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); * is probably wrong. Should make it like BSD or SYSV. */ -static int kill_something_info(int sig, struct siginfo *info, pid_t pid) +static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; @@ -1482,7 +1482,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1533,7 +1533,7 @@ int force_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; @@ -1556,7 +1556,7 @@ int send_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; @@ -1576,7 +1576,7 @@ int send_sig_fault(int sig, int code, void __user *addr int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { - struct siginfo info; + struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); @@ -1590,7 +1590,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { - struct siginfo info; + struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); @@ -1605,7 +1605,7 @@ EXPORT_SYMBOL(send_sig_mceerr); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; @@ -1620,7 +1620,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; @@ -1637,7 +1637,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) */ int force_sig_ptrace_errno_trap(int errno, void __user *addr) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; @@ -1766,7 +1766,7 @@ ret: */ bool do_notify_parent(struct task_struct *tsk, int sig) { - struct siginfo info; + struct kernel_siginfo info; unsigned long flags; struct sighand_struct *psig; bool autoreap = false; @@ -1871,7 +1871,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) static void do_notify_parent_cldstop(struct task_struct *tsk, bool for_ptracer, int why) { - struct siginfo info; + struct kernel_siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; @@ -1971,7 +1971,7 @@ static bool sigkill_pending(struct task_struct *tsk) * If we actually decide not to stop at all because the tracer * is gone, we keep current->exit_code unless clear_code. */ -static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { @@ -2108,7 +2108,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) static void ptrace_do_notify(int signr, int exit_code, int why) { - siginfo_t info; + kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = signr; @@ -2289,7 +2289,7 @@ static void do_jobctl_trap(void) } } -static int ptrace_signal(int signr, siginfo_t *info) +static int ptrace_signal(int signr, kernel_siginfo_t *info) { /* * We do not check sig_kernel_stop(signr) but set this marker @@ -2889,14 +2889,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) return layout; } -int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) +int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { - if (copy_to_user(to, from , sizeof(struct siginfo))) + if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; return 0; } -int copy_siginfo_from_user(siginfo_t *to, const siginfo_t __user *from) +int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct siginfo))) return -EFAULT; @@ -2905,13 +2905,13 @@ int copy_siginfo_from_user(siginfo_t *to, const siginfo_t __user *from) #ifdef CONFIG_COMPAT int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct siginfo *from) + const struct kernel_siginfo *from) #if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) { return __copy_siginfo_to_user32(to, from, in_x32_syscall()); } int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct siginfo *from, bool x32_ABI) + const struct kernel_siginfo *from, bool x32_ABI) #endif { struct compat_siginfo new; @@ -2995,7 +2995,7 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to, return 0; } -int copy_siginfo_from_user32(struct siginfo *to, +int copy_siginfo_from_user32(struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; @@ -3085,7 +3085,7 @@ int copy_siginfo_from_user32(struct siginfo *to, * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ -static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, +static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, const struct timespec *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; @@ -3149,7 +3149,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, { sigset_t these; struct timespec ts; - siginfo_t info; + kernel_siginfo_t info; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ @@ -3181,7 +3181,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, { sigset_t s; struct timespec t; - siginfo_t info; + kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) @@ -3213,7 +3213,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; @@ -3226,7 +3226,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) } static int -do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) +do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { struct task_struct *p; int error = -ESRCH; @@ -3257,7 +3257,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info; + struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; @@ -3304,7 +3304,7 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } -static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) +static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info) { /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. @@ -3329,7 +3329,7 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info) SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { - siginfo_t info; + kernel_siginfo_t info; int ret = copy_siginfo_from_user(&info, uinfo); if (unlikely(ret)) return ret; @@ -3342,7 +3342,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + kernel_siginfo_t info; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3350,7 +3350,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, } #endif -static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -3372,7 +3372,7 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { - siginfo_t info; + kernel_siginfo_t info; int ret = copy_siginfo_from_user(&info, uinfo); if (unlikely(ret)) return ret; @@ -3386,7 +3386,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + kernel_siginfo_t info; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3968,10 +3968,57 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -void __init signals_init(void) +static inline void siginfo_buildtime_checks(void) { BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); + /* Verify the offsets in the two siginfos match */ +#define CHECK_OFFSET(field) \ + BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field)) + + /* kill */ + CHECK_OFFSET(si_pid); + CHECK_OFFSET(si_uid); + + /* timer */ + CHECK_OFFSET(si_tid); + CHECK_OFFSET(si_overrun); + CHECK_OFFSET(si_value); + + /* rt */ + CHECK_OFFSET(si_pid); + CHECK_OFFSET(si_uid); + CHECK_OFFSET(si_value); + + /* sigchld */ + CHECK_OFFSET(si_pid); + CHECK_OFFSET(si_uid); + CHECK_OFFSET(si_status); + CHECK_OFFSET(si_utime); + CHECK_OFFSET(si_stime); + + /* sigfault */ + CHECK_OFFSET(si_addr); + CHECK_OFFSET(si_addr_lsb); + CHECK_OFFSET(si_lower); + CHECK_OFFSET(si_upper); + CHECK_OFFSET(si_pkey); + + /* sigpoll */ + CHECK_OFFSET(si_band); + CHECK_OFFSET(si_fd); + + /* sigsys */ + CHECK_OFFSET(si_call_addr); + CHECK_OFFSET(si_syscall); + CHECK_OFFSET(si_arch); +#undef CHECK_OFFSET +} + +void __init signals_init(void) +{ + siginfo_buildtime_checks(); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4b9127e95430..eabb4c22728d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -308,7 +308,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ -void posixtimer_rearm(struct siginfo *info) +void posixtimer_rearm(struct kernel_siginfo *info) { struct k_itimer *timr; unsigned long flags; -- cgit v1.2.3 From 4ce5f9c9e7546915c559ffae594e6d73f918db00 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 25 Sep 2018 12:59:31 +0200 Subject: signal: Use a smaller struct siginfo in the kernel We reserve 128 bytes for struct siginfo but only use about 48 bytes on 64bit and 32 bytes on 32bit. Someday we might use more but it is unlikely to be anytime soon. Userspace seems content with just enough bytes of siginfo to implement sigqueue. Or in the case of checkpoint/restart reinjecting signals the kernel has sent. Reducing the stack footprint and the work to copy siginfo around from 2 cachelines to 1 cachelines seems worth doing even if I don't have benchmarks to show a performance difference. Suggested-by: Linus Torvalds Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 82 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 161cad4e448c..1c2dd117fee0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2844,27 +2844,48 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, } #endif +static const struct { + unsigned char limit, layout; +} sig_sicodes[] = { + [SIGILL] = { NSIGILL, SIL_FAULT }, + [SIGFPE] = { NSIGFPE, SIL_FAULT }, + [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, + [SIGBUS] = { NSIGBUS, SIL_FAULT }, + [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, +#if defined(SIGEMT) + [SIGEMT] = { NSIGEMT, SIL_FAULT }, +#endif + [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, + [SIGPOLL] = { NSIGPOLL, SIL_POLL }, + [SIGSYS] = { NSIGSYS, SIL_SYS }, +}; + +static bool known_siginfo_layout(int sig, int si_code) +{ + if (si_code == SI_KERNEL) + return true; + else if ((si_code > SI_USER)) { + if (sig_specific_sicodes(sig)) { + if (si_code <= sig_sicodes[sig].limit) + return true; + } + else if (si_code <= NSIGPOLL) + return true; + } + else if (si_code >= SI_DETHREAD) + return true; + else if (si_code == SI_ASYNCNL) + return true; + return false; +} + enum siginfo_layout siginfo_layout(int sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { - static const struct { - unsigned char limit, layout; - } filter[] = { - [SIGILL] = { NSIGILL, SIL_FAULT }, - [SIGFPE] = { NSIGFPE, SIL_FAULT }, - [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, - [SIGBUS] = { NSIGBUS, SIL_FAULT }, - [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, -#if defined(SIGEMT) - [SIGEMT] = { NSIGEMT, SIL_FAULT }, -#endif - [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, - [SIGPOLL] = { NSIGPOLL, SIL_POLL }, - [SIGSYS] = { NSIGSYS, SIL_SYS }, - }; - if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) { - layout = filter[sig].layout; + if ((sig < ARRAY_SIZE(sig_sicodes)) && + (si_code <= sig_sicodes[sig].limit)) { + layout = sig_sicodes[sig].layout; /* Handle the exceptions */ if ((sig == SIGBUS) && (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) @@ -2889,17 +2910,42 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) return layout; } +static inline char __user *si_expansion(const siginfo_t __user *info) +{ + return ((char __user *)info) + sizeof(struct kernel_siginfo); +} + int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { + char __user *expansion = si_expansion(to); if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; + if (clear_user(expansion, SI_EXPANSION_SIZE)) + return -EFAULT; return 0; } int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) { - if (copy_from_user(to, from, sizeof(struct siginfo))) + if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; + if (unlikely(!known_siginfo_layout(to->si_signo, to->si_code))) { + char __user *expansion = si_expansion(from); + char buf[SI_EXPANSION_SIZE]; + int i; + /* + * An unknown si_code might need more than + * sizeof(struct kernel_siginfo) bytes. Verify all of the + * extra bytes are 0. This guarantees copy_siginfo_to_user + * will return this data to userspace exactly. + */ + if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE)) + return -EFAULT; + for (i = 0; i < SI_EXPANSION_SIZE; i++) { + if (buf[i] != 0) + return -E2BIG; + } + } return 0; } -- cgit v1.2.3 From 277fcdb2cfee38ccdbe07e705dbd4896ba0c9930 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Sun, 30 Sep 2018 00:45:50 +0800 Subject: printk: Fix panic caused by passing log_buf_len to command line log_buf_len_setup does not check input argument before passing it to simple_strtoull. The argument would be a NULL pointer if "log_buf_len", without its value, is set in command line and thus causes the following panic. PANIC: early exception 0xe3 IP 10:ffffffffaaeacd0d error 0 cr2 0x0 [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc4-yocto-standard+ #1 [ 0.000000] RIP: 0010:_parse_integer_fixup_radix+0xd/0x70 ... [ 0.000000] Call Trace: [ 0.000000] simple_strtoull+0x29/0x70 [ 0.000000] memparse+0x26/0x90 [ 0.000000] log_buf_len_setup+0x17/0x22 [ 0.000000] do_early_param+0x57/0x8e [ 0.000000] parse_args+0x208/0x320 [ 0.000000] ? rdinit_setup+0x30/0x30 [ 0.000000] parse_early_options+0x29/0x2d [ 0.000000] ? rdinit_setup+0x30/0x30 [ 0.000000] parse_early_param+0x36/0x4d [ 0.000000] setup_arch+0x336/0x99e [ 0.000000] start_kernel+0x6f/0x4ee [ 0.000000] x86_64_start_reservations+0x24/0x26 [ 0.000000] x86_64_start_kernel+0x6f/0x72 [ 0.000000] secondary_startup_64+0xa4/0xb0 This patch adds a check to prevent the panic. Link: http://lkml.kernel.org/r/1538239553-81805-1-git-send-email-zhe.he@windriver.com Cc: stable@vger.kernel.org Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Signed-off-by: He Zhe Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c5b568c2d167..a14f15ad0f35 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1049,7 +1049,12 @@ static void __init log_buf_len_update(unsigned size) /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { - unsigned size = memparse(str, &str); + unsigned int size; + + if (!str) + return -EINVAL; + + size = memparse(str, &str); log_buf_len_update(size); -- cgit v1.2.3 From 51a72ab7372d85c96104e58036f1b49ba11e5d2b Mon Sep 17 00:00:00 2001 From: He Zhe Date: Sun, 30 Sep 2018 00:45:51 +0800 Subject: printk: Correct wrong casting log_first_seq and console_seq are 64-bit unsigned integers. Correct a wrong casting that might cut off the output. Link: http://lkml.kernel.org/r/1538239553-81805-2-git-send-email-zhe.he@windriver.com Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Signed-off-by: He Zhe [sergey.senozhatsky@gmail.com: More descriptive commit message] Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a14f15ad0f35..d4704baf6aaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2358,8 +2358,9 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); if (console_seq < log_first_seq) { - len = sprintf(text, "** %u printk messages dropped **\n", - (unsigned)(log_first_seq - console_seq)); + len = sprintf(text, + "** %llu printk messages dropped **\n", + log_first_seq - console_seq); /* messages are gone, move to first one */ console_seq = log_first_seq; -- cgit v1.2.3 From dd5adbfbfc50ff998909660c6158c0488f7b6a2b Mon Sep 17 00:00:00 2001 From: He Zhe Date: Sun, 30 Sep 2018 00:45:52 +0800 Subject: printk: Add KBUILD_MODNAME and remove a redundant print prefix Add KBUILD_MODNAME to make prints more clear. Link: http://lkml.kernel.org/r/1538239553-81805-3-git-send-email-zhe.he@windriver.com Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Signed-off-by: He Zhe Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d4704baf6aaf..15f3e70be448 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -16,6 +16,8 @@ * 01Mar01 Andrew Morton */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -2699,7 +2701,7 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) if (!nr_ext_console_drivers++) - pr_info("printk: continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + pr_info("continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); if (newcon->flags & CON_PRINTBUFFER) { /* -- cgit v1.2.3 From 601d5abfeaf244b86bb68c1e05c6e0d57be2f6b0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Oct 2018 09:02:48 +0200 Subject: signal: In sigqueueinfo prefer sig not si_signo Andrei Vagin reported: > Accoding to the man page, the user should not set si_signo, it has to be set > by kernel. > > $ man 2 rt_sigqueueinfo > > The uinfo argument specifies the data to accompany the signal. This > argument is a pointer to a structure of type siginfo_t, described in > sigaction(2) (and defined by including ). The caller > should set the following fields in this structure: > > si_code > This must be one of the SI_* codes in the Linux kernel source > file include/asm-generic/siginfo.h, with the restriction that > the code must be negative (i.e., cannot be SI_USER, which is > used by the kernel to indicate a signal sent by kill(2)) and > cannot (since Linux 2.6.39) be SI_TKILL (which is used by the > kernel to indicate a signal sent using tgkill(2)). > > si_pid This should be set to a process ID, typically the process ID of > the sender. > > si_uid This should be set to a user ID, typically the real user ID of > the sender. > > si_value > This field contains the user data to accompany the signal. For > more information, see the description of the last (union sigval) > argument of sigqueue(3). > > Internally, the kernel sets the si_signo field to the value specified > in sig, so that the receiver of the signal can also obtain the signal > number via that field. > > On Tue, Sep 25, 2018 at 07:19:02PM +0200, Eric W. Biederman wrote: >> >> If there is some application that calls sigqueueinfo directly that has >> a problem with this added sanity check we can revisit this when we see >> what kind of crazy that application is doing. > > > I already know two "applications" ;) > > https://github.com/torvalds/linux/blob/master/tools/testing/selftests/ptrace/peeksiginfo.c > https://github.com/checkpoint-restore/criu/blob/master/test/zdtm/static/sigpending.c > > Disclaimer: I'm the author of both of them. Looking at the kernel code the historical behavior has alwasy been to prefer the signal number passed in by the kernel. So sigh. Implmenet __copy_siginfo_from_user and __copy_siginfo_from_user32 to take that signal number and prefer it. The user of ptrace will still use copy_siginfo_from_user and copy_siginfo_from_user32 as they do not and never have had a signal number there. Luckily this change has never made it farther than linux-next. Fixes: e75dc036c445 ("signal: Fail sigqueueinfo if si_signo != sig") Reported-by: Andrei Vagin Tested-by: Andrei Vagin Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 141 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 84 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1c2dd117fee0..2bffc5a50183 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2925,11 +2925,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) return 0; } -int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) +static int post_copy_siginfo_from_user(kernel_siginfo_t *info, + const siginfo_t __user *from) { - if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) - return -EFAULT; - if (unlikely(!known_siginfo_layout(to->si_signo, to->si_code))) { + if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) { char __user *expansion = si_expansion(from); char buf[SI_EXPANSION_SIZE]; int i; @@ -2949,6 +2948,22 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) return 0; } +static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to, + const siginfo_t __user *from) +{ + if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) + return -EFAULT; + to->si_signo = signo; + return post_copy_siginfo_from_user(to, from); +} + +int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) +{ + if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) + return -EFAULT; + return post_copy_siginfo_from_user(to, from); +} + #ifdef CONFIG_COMPAT int copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) @@ -3041,88 +3056,106 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to, return 0; } -int copy_siginfo_from_user32(struct kernel_siginfo *to, - const struct compat_siginfo __user *ufrom) +static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, + const struct compat_siginfo *from) { - struct compat_siginfo from; - - if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) - return -EFAULT; - clear_siginfo(to); - to->si_signo = from.si_signo; - to->si_errno = from.si_errno; - to->si_code = from.si_code; - switch(siginfo_layout(from.si_signo, from.si_code)) { + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; + switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - to->si_pid = from.si_pid; - to->si_uid = from.si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - to->si_tid = from.si_tid; - to->si_overrun = from.si_overrun; - to->si_int = from.si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - to->si_band = from.si_band; - to->si_fd = from.si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - to->si_addr = compat_ptr(from.si_addr); + to->si_addr = compat_ptr(from->si_addr); #ifdef __ARCH_SI_TRAPNO - to->si_trapno = from.si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - to->si_addr = compat_ptr(from.si_addr); + to->si_addr = compat_ptr(from->si_addr); #ifdef __ARCH_SI_TRAPNO - to->si_trapno = from.si_trapno; + to->si_trapno = from->si_trapno; #endif - to->si_addr_lsb = from.si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - to->si_addr = compat_ptr(from.si_addr); + to->si_addr = compat_ptr(from->si_addr); #ifdef __ARCH_SI_TRAPNO - to->si_trapno = from.si_trapno; + to->si_trapno = from->si_trapno; #endif - to->si_lower = compat_ptr(from.si_lower); - to->si_upper = compat_ptr(from.si_upper); + to->si_lower = compat_ptr(from->si_lower); + to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: - to->si_addr = compat_ptr(from.si_addr); + to->si_addr = compat_ptr(from->si_addr); #ifdef __ARCH_SI_TRAPNO - to->si_trapno = from.si_trapno; + to->si_trapno = from->si_trapno; #endif - to->si_pkey = from.si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - to->si_pid = from.si_pid; - to->si_uid = from.si_uid; - to->si_status = from.si_status; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; #ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { - to->si_utime = from._sifields._sigchld_x32._utime; - to->si_stime = from._sifields._sigchld_x32._stime; + to->si_utime = from->_sifields._sigchld_x32._utime; + to->si_stime = from->_sifields._sigchld_x32._stime; } else #endif { - to->si_utime = from.si_utime; - to->si_stime = from.si_stime; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; } break; case SIL_RT: - to->si_pid = from.si_pid; - to->si_uid = from.si_uid; - to->si_int = from.si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - to->si_call_addr = compat_ptr(from.si_call_addr); - to->si_syscall = from.si_syscall; - to->si_arch = from.si_arch; + to->si_call_addr = compat_ptr(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } return 0; } + +static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to, + const struct compat_siginfo __user *ufrom) +{ + struct compat_siginfo from; + + if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) + return -EFAULT; + + from.si_signo = signo; + return post_copy_siginfo_from_user32(to, &from); +} + +int copy_siginfo_from_user32(struct kernel_siginfo *to, + const struct compat_siginfo __user *ufrom) +{ + struct compat_siginfo from; + + if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) + return -EFAULT; + + return post_copy_siginfo_from_user32(to, &from); +} #endif /* CONFIG_COMPAT */ /** @@ -3359,9 +3392,6 @@ static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info) (task_pid_vnr(current) != pid)) return -EPERM; - if (info->si_signo != sig) - return -EINVAL; - /* POSIX.1b doesn't mention process groups. */ return kill_proc_info(sig, info, pid); } @@ -3376,7 +3406,7 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; - int ret = copy_siginfo_from_user(&info, uinfo); + int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); @@ -3389,7 +3419,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; - int ret = copy_siginfo_from_user32(&info, uinfo); + int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); @@ -3409,9 +3439,6 @@ static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t (task_pid_vnr(current) != pid)) return -EPERM; - if (info->si_signo != sig) - return -EINVAL; - return do_send_specific(tgid, pid, sig, info); } @@ -3419,7 +3446,7 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; - int ret = copy_siginfo_from_user(&info, uinfo); + int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); @@ -3433,7 +3460,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; - int ret = copy_siginfo_from_user32(&info, uinfo); + int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); -- cgit v1.2.3 From c941ce9c282cc606e6517356fcc186a9da2b4ab9 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sun, 7 Oct 2018 12:56:47 +0100 Subject: bpf: add verifier callback to get stack usage info for offloaded progs In preparation for BPF-to-BPF calls in offloaded programs, add a new function attribute to the struct bpf_prog_offload_ops so that drivers supporting eBPF offload can hook at the end of program verification, and potentially extract information collected by the verifier. Implement a minimal callback (returning 0) in the drivers providing the structs, namely netdevsim and nfp. This will be useful in the nfp driver, in later commits, to extract the number of subprograms as well as the stack depth for those subprograms. Signed-off-by: Quentin Monnet Reviewed-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 18 ++++++++++++++++++ kernel/bpf/verifier.c | 3 +++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 177a52436394..8e93c47f0779 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -172,6 +172,24 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, return ret; } +int bpf_prog_offload_finalize(struct bpf_verifier_env *env) +{ + struct bpf_prog_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (offload->dev_ops->finalize) + ret = offload->dev_ops->finalize(env); + else + ret = 0; + } + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 73c81bef6ae8..a0454cb299ba 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6309,6 +6309,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->cur_state = NULL; } + if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) + ret = bpf_prog_offload_finalize(env); + skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); -- cgit v1.2.3 From e4052d06a5195b29271a7af262711d69f9ecfd04 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Sun, 7 Oct 2018 12:56:58 +0100 Subject: bpf: allow offload of programs with BPF-to-BPF function calls Now that there is at least one driver supporting BPF-to-BPF function calls, lift the restriction, in the verifier, on hardware offload of eBPF programs containing such calls. But prevent jit_subprogs(), still in the verifier, from being run for offloaded programs. Signed-off-by: Quentin Monnet Reviewed-by: Jiong Wang Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a0454cb299ba..73cc136915fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1009,10 +1009,6 @@ static int check_subprogs(struct bpf_verifier_env *env) verbose(env, "function calls to other bpf functions are allowed for root only\n"); return -EPERM; } - if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "function calls in offloaded programs are not supported yet\n"); - return -EINVAL; - } ret = add_subprog(env, i + insn[i].imm + 1); if (ret < 0) return ret; @@ -5968,10 +5964,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; int i, depth; #endif - int err; + int err = 0; - err = 0; - if (env->prog->jit_requested) { + if (env->prog->jit_requested && + !bpf_prog_is_dev_bound(env->prog->aux)) { err = jit_subprogs(env); if (err == 0) return 0; -- cgit v1.2.3 From b8d62f33b7b225935649ab165d901fe8dd7f95e5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 8 Oct 2018 13:17:26 +0200 Subject: genirq: Fix grammar s/an /a / Fix a grammar mistake in . [ mingo: While at it also fix another similar error in another comment as well. ] Signed-off-by: Geert Uytterhoeven Cc: Jiri Kosina Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181008111726.26286-1-geert%2Brenesas@glider.be Signed-off-by: Ingo Molnar --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3b30a4aeb0db..3366d11c3e02 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -867,7 +867,7 @@ void irq_dispose_mapping(unsigned int virq) EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** - * irq_find_mapping() - Find a linux irq from an hw irq number. + * irq_find_mapping() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt * @hwirq: hardware irq number in that domain space */ -- cgit v1.2.3 From e6fe3e5b7d16e8f146a4ae7fe481bc6e97acde1e Mon Sep 17 00:00:00 2001 From: He Zhe Date: Sun, 30 Sep 2018 00:45:53 +0800 Subject: printk: Give error on attempt to set log buffer length to over 2G The current printk() is ready to handle log buffer size up to 2G. Give an explicit error for users who want to use larger log buffer. Also fix printk formatting to show the 2G as a positive number. Link: http://lkml.kernel.org/r/20181008135916.gg4kkmoki5bgtco5@pathway.suse.cz Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Suggested-by: Sergey Senozhatsky Signed-off-by: He Zhe Reviewed-by: Sergey Senozhatsky [pmladek: Fixed to the really safe limit 2GB.] Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 15f3e70be448..fce696d80e09 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -440,6 +440,7 @@ static u32 clear_idx; /* record buffer */ #define LOG_ALIGN __alignof__(struct printk_log) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define LOG_BUF_LEN_MAX (u32)(1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -1040,18 +1041,23 @@ void log_buf_vmcoreinfo_setup(void) static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ -static void __init log_buf_len_update(unsigned size) +static void __init log_buf_len_update(u64 size) { + if (size > (u64)LOG_BUF_LEN_MAX) { + size = (u64)LOG_BUF_LEN_MAX; + pr_err("log_buf over 2G is not supported.\n"); + } + if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) - new_log_buf_len = size; + new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { - unsigned int size; + u64 size; if (!str) return -EINVAL; @@ -1121,7 +1127,7 @@ void __init setup_log_buf(int early) } if (unlikely(!new_log_buf)) { - pr_err("log_buf_len: %ld bytes not available\n", + pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); return; } @@ -1134,8 +1140,8 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); logbuf_unlock_irqrestore(flags); - pr_info("log_buf_len: %d bytes\n", log_buf_len); - pr_info("early log buf free: %d(%d%%)\n", + pr_info("log_buf_len: %u bytes\n", log_buf_len); + pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); } -- cgit v1.2.3 From 8af03d1ae2e154a8be3631e8694b87007e1bdbc2 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sun, 7 Oct 2018 15:23:15 -0500 Subject: bpf: btf: Fix a missing check bug In btf_parse_hdr(), the length of the btf data header is firstly copied from the user space to 'hdr_len' and checked to see whether it is larger than 'btf_data_size'. If yes, an error code EINVAL is returned. Otherwise, the whole header is copied again from the user space to 'btf->hdr'. However, after the second copy, there is no check between 'btf->hdr->hdr_len' and 'hdr_len' to confirm that the two copies get the same value. Given that the btf data is in the user space, a malicious user can race to change the data between the two copies. By doing so, the user can provide malicious data to the kernel and cause undefined behavior. This patch adds a necessary check after the second copy, to make sure 'btf->hdr->hdr_len' has the same value as 'hdr_len'. Otherwise, an error code EINVAL will be returned. Signed-off-by: Wenwen Wang Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 138f0302692e..378cef70341c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2114,6 +2114,9 @@ static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, hdr = &btf->hdr; + if (hdr->hdr_len != hdr_len) + return -EINVAL; + btf_verifier_log_hdr(env, btf_data_size); if (hdr->magic != BTF_MAGIC) { -- cgit v1.2.3 From 509db2833e0ddac7faf6e7d2dd6e7f85c98fbee0 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 9 Oct 2018 10:04:49 +0900 Subject: bpf: error handling when map_lookup_elem isn't supported The error value returned by map_lookup_elem doesn't differentiate whether lookup was failed because of invalid key or lookup is not supported. Lets add handling for -EOPNOTSUPP return value of map_lookup_elem() method of map, with expectation from map's implementation that it should return -EOPNOTSUPP if lookup is not supported. The errno for bpf syscall for BPF_MAP_LOOKUP_ELEM command will be set to EOPNOTSUPP if map lookup is not supported. Signed-off-by: Prashant Bhole Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5742df21598c..4f416234251f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -719,10 +719,15 @@ static int map_lookup_elem(union bpf_attr *attr) } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); - if (ptr) + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + } else if (!ptr) { + err = -ENOENT; + } else { + err = 0; memcpy(value, ptr, value_size); + } rcu_read_unlock(); - err = ptr ? 0 : -ENOENT; } if (err) -- cgit v1.2.3 From 3b4a63f674e94795c9b76a41b36364b8aec232e1 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 9 Oct 2018 10:04:50 +0900 Subject: bpf: return EOPNOTSUPP when map lookup isn't supported Return ERR_PTR(-EOPNOTSUPP) from map_lookup_elem() methods of below map types: - BPF_MAP_TYPE_PROG_ARRAY - BPF_MAP_TYPE_STACK_TRACE - BPF_MAP_TYPE_XSKMAP - BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH Signed-off-by: Prashant Bhole Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 2 +- kernel/bpf/sockmap.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/xskmap.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index dded84cbe814..24583da9ffd1 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map) static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* only called from syscall */ diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d37a1a0a6e1e..5d0677d808ae 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2096,7 +2096,7 @@ int sockmap_get_from_fd(const union bpf_attr *attr, int type, static void *sock_map_lookup(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static int sock_map_update_elem(struct bpf_map *map, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 8061a439ef18..b2ade10f7ec3 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_get_stack_proto = { /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..ef0b7b6ef8a5 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map) static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, -- cgit v1.2.3 From d59e0ba19481c0046d2ea2bd0e5344eeaf45aace Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Tue, 9 Oct 2018 11:43:35 -0400 Subject: tick/sched : Remove redundant cpu_online() check can_stop_idle_tick() checks cpu_online() twice. The first check leaves the function when the CPU is not online, so the second check it redundant. Remove it. Signed-off-by: Peng Hao Signed-off-by: Thomas Gleixner Cc: fweisbec@gmail.com Link: https://lkml.kernel.org/r/1539099815-2943-1-git-send-email-penghao122@sina.com.cn --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 5b33e2f5c0ed..69e673b88474 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -885,7 +885,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (need_resched()) return false; - if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + if (unlikely(local_softirq_pending())) { static int ratelimit; if (ratelimit < 10 && -- cgit v1.2.3 From b2a2ab527d6de02fbf2331bae4a299d58ab52266 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 10 Oct 2018 20:11:25 -0500 Subject: signal: Guard against negative signal numbers in copy_siginfo_from_user The bounds checks in known_siginfo_layout only guards against positive numbers that are too large, large negative can slip through and can cause out of bounds accesses. Ordinarily this is not a concern because early in signal processing the signal number is filtered with valid_signal which ensures it is a small positive signal number, but copy_siginfo_from_user is called before this check is performed. [ 73.031126] BUG: unable to handle kernel paging request at ffffffff6281bcb6 [ 73.032038] PGD 3014067 P4D 3014067 PUD 0 [ 73.032565] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [ 73.033287] CPU: 0 PID: 732 Comm: trinity-c3 Tainted: G W T 4.19.0-rc1-00077-g4ce5f9c #1 [ 73.034423] RIP: 0010:copy_siginfo_from_user+0x4d/0xd0 [ 73.034908] Code: 00 8b 53 08 81 fa 80 00 00 00 0f 84 90 00 00 00 85 d2 7e 2d 48 63 0b 83 f9 1f 7f 1c 8d 71 ff bf d8 04 01 50 48 0f a3 f7 73 0e <0f> b6 8c 09 20 bb 81 82 39 ca 7f 15 eb 68 31 c0 83 fa 06 7f 0c eb [ 73.036665] RSP: 0018:ffff88001b8f7e20 EFLAGS: 00010297 [ 73.037160] RAX: 0000000000000000 RBX: ffff88001b8f7e90 RCX: fffffffff00000cb [ 73.037865] RDX: 0000000000000001 RSI: 00000000f00000ca RDI: 00000000500104d8 [ 73.038546] RBP: ffff88001b8f7e80 R08: 0000000000000000 R09: 0000000000000000 [ 73.039201] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000008 [ 73.039874] R13: 00000000000002dc R14: 0000000000000000 R15: 0000000000000000 [ 73.040613] FS: 000000000104a880(0000) GS:ffff88001f000000(0000) knlGS:0000000000000000 [ 73.041649] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 73.042405] CR2: ffffffff6281bcb6 CR3: 000000001cb52003 CR4: 00000000001606b0 [ 73.043351] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 73.044286] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 [ 73.045221] Call Trace: [ 73.045556] __x64_sys_rt_tgsigqueueinfo+0x34/0xa0 [ 73.046199] do_syscall_64+0x1a4/0x390 [ 73.046708] ? vtime_user_enter+0x61/0x80 [ 73.047242] ? __context_tracking_enter+0x4e/0x60 [ 73.047714] ? __context_tracking_enter+0x4e/0x60 [ 73.048278] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Therefore fix known_siginfo_layout to take an unsigned signal number instead of a signed signal number. All valid signal numbers are small positive numbers so they will not be affected, but invalid negative signal numbers will now become large positive signal numbers and will not be used as indices into the sig_sicodes array. Making the signal number unsigned makes it difficult for similar mistakes to happen in the future. Fixes: 4ce5f9c9e754 ("signal: Use a smaller struct siginfo in the kernel") Inspired-by: Sean Christopherson Reported-by: kernel test robot Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2bffc5a50183..5f5bf374512b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2860,7 +2860,7 @@ static const struct { [SIGSYS] = { NSIGSYS, SIL_SYS }, }; -static bool known_siginfo_layout(int sig, int si_code) +static bool known_siginfo_layout(unsigned sig, int si_code) { if (si_code == SI_KERNEL) return true; -- cgit v1.2.3 From a36700589b85443e28170be59fa11c8a104130a5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 10 Oct 2018 20:29:44 -0500 Subject: signal: Guard against negative signal numbers in copy_siginfo_from_user32 While fixing an out of bounds array access in known_siginfo_layout reported by the kernel test robot it became apparent that the same bug exists in siginfo_layout and affects copy_siginfo_from_user32. The straight forward fix that makes guards against making this mistake in the future and should keep the code size small is to just take an unsigned signal number instead of a signed signal number, as I did to fix known_siginfo_layout. Cc: stable@vger.kernel.org Fixes: cc731525f26a ("signal: Remove kernel interal si_code magic") Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5f5bf374512b..4fd431ce4f91 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2879,7 +2879,7 @@ static bool known_siginfo_layout(unsigned sig, int si_code) return false; } -enum siginfo_layout siginfo_layout(int sig, int si_code) +enum siginfo_layout siginfo_layout(unsigned sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { -- cgit v1.2.3 From 9627808d2d409279cea3fb334212d04a83ff6371 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 2 Oct 2018 11:38:34 +0900 Subject: printk: keep kernel cont support always enabled Since commit 5c2992ee7fd8a29 ("printk: remove console flushing special cases for partial buffered lines") we don't print cont fragments to the consoles; cont lines are now proper log_buf entries and there is no "consecutive continuation flag" anymore: we either have 'c' entries that mark continuation lines without fragments; or '-' entries that mark normal logbuf entries. There are no '+' entries anymore. However, we still have a small leftover - presence of ext_console drivers disables kernel cont support and we flush each pr_cont() and store it as a separate log_buf entry. Previously, it worked because msg_print_ext_header() had that "an optional external merge of the records" functionality: if (msg->flags & LOG_CONT) cont = (prev_flags & LOG_CONT) ? '+' : 'c'; We don't do this as of now, so keep kernel cont always enabled. Note from pmladek: The original purpose was to get full information including the metadata and dictionary via extended console drivers, see commit 6fe29354befe4c46e ("printk: implement support for extended console drivers"). The dictionary probably was the most important part but it was actually lost: static void cont_flush(void) { [...] log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, NULL, 0, cont.buf, cont.len); Nobody noticed because the only dictionary user is dev_printk() and dev_cont() is _not_ defined. Link: http://lkml.kernel.org/r/20181002023836.4487-2-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: Andrew Morton Cc: Dmitriy Vyukov Cc: Tetsuo Handa Cc: Tejun Heo Cc: Peter Zijlstra Cc: LKML Cc: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky [pmladek@suse.com: Updated commit message] Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fce696d80e09..f717656c0fac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -194,16 +194,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } -/* - * Number of registered extended console drivers. - * - * If extended consoles are present, in-kernel cont reassembly is disabled - * and each fragment is stored as a separate log entry with proper - * continuation flag so that every emitted message has full metadata. This - * doesn't change the result for regular consoles or /proc/kmsg. For - * /dev/kmsg, as long as the reader concatenates messages according to - * consecutive continuation flags, the end result should be the same too. - */ +/* Number of registered extended console drivers. */ static int nr_ext_console_drivers; /* @@ -1781,12 +1772,8 @@ static void cont_flush(void) static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - /* - * If ext consoles are present, flush and skip in-kernel - * continuation. See nr_ext_console_drivers definition. Also, if - * the line gets too long, split it up in separate records. - */ - if (nr_ext_console_drivers || cont.len + len > sizeof(cont.buf)) { + /* If the line gets too long, split it up in separate records. */ + if (cont.len + len > sizeof(cont.buf)) { cont_flush(); return false; } @@ -2706,8 +2693,7 @@ void register_console(struct console *newcon) } if (newcon->flags & CON_EXTENDED) - if (!nr_ext_console_drivers++) - pr_info("continuation disabled due to ext consoles, expect more fragments in /dev/kmsg\n"); + nr_ext_console_drivers++; if (newcon->flags & CON_PRINTBUFFER) { /* -- cgit v1.2.3 From 3ac37a93fa9217e576bebfd4ba3e80edaaeb2289 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 2 Oct 2018 11:38:35 +0900 Subject: printk: lock/unlock console only for new logbuf entries Prior to commit 5c2992ee7fd8a29 ("printk: remove console flushing special cases for partial buffered lines") we would do console_cont_flush() for each pr_cont() to print cont fragments, so console_unlock() would actually print data: pr_cont(); console_lock(); console_unlock() console_cont_flush(); // print cont fragment ... pr_cont(); console_lock(); console_unlock() console_cont_flush(); // print cont fragment We don't do console_cont_flush() anymore, so when we do pr_cont() console_unlock() does nothing (unless we flushed the cont buffer): pr_cont(); console_lock(); console_unlock(); // noop ... pr_cont(); console_lock(); console_unlock(); // noop ... pr_cont(); cont_flush(); console_lock(); console_unlock(); // print data We also wakeup klogd purposelessly for pr_cont() output - un-flushed cont buffer is not stored in log_buf; there is nothing to pull. Thus we can console_lock()/console_unlock()/wake_up_klogd() only when we know that we log_store()-ed a message and there is something to print to the consoles/syslog. Link: http://lkml.kernel.org/r/20181002023836.4487-3-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: Andrew Morton Cc: Dmitriy Vyukov Cc: Tetsuo Handa Cc: Tejun Heo Cc: Peter Zijlstra Cc: LKML Cc: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f717656c0fac..e9a7e50ed60a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1890,8 +1890,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *fmt, va_list args) { int printed_len; - bool in_sched = false; + bool in_sched = false, pending_output; unsigned long flags; + u64 curr_log_seq; if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; @@ -1903,11 +1904,13 @@ asmlinkage int vprintk_emit(int facility, int level, /* This stops the holder of console_sem just where we want him */ logbuf_lock_irqsave(flags); + curr_log_seq = log_next_seq; printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); + pending_output = (curr_log_seq != log_next_seq); logbuf_unlock_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (!in_sched && pending_output) { /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to @@ -1924,7 +1927,8 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - wake_up_klogd(); + if (pending_output) + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.2.3 From 0e96a19c4450253c3ddcff69140b1096f2c2adaf Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 2 Oct 2018 11:38:36 +0900 Subject: printk: do not preliminary split up cont buffer We have a proper 'overflow' check which tells us that we need to split up existing cont buffer in separate records: if (cont.len + len > sizeof(cont.buf)) cont_flush(); At the same time we also have one extra flush: "if cont buffer is 80% full then split it up" in cont_add(): if (cont.len > (sizeof(cont.buf) * 80) / 100) cont_flush(); This looks to be redundant, since the existing "overflow" check should work just fine, so remove this 80% check and wait for either a normal cont termination \n, for preliminary flush due to possible buffer overflow or for preliminary flush due to cont race. Link: http://lkml.kernel.org/r/20181002023836.4487-4-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: Andrew Morton Cc: Dmitriy Vyukov Cc: Tetsuo Handa Cc: Tejun Heo Cc: Peter Zijlstra Cc: LKML Cc: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e9a7e50ed60a..505663bb4f1f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1796,9 +1796,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont_flush(); } - if (cont.len > (sizeof(cont.buf) * 80) / 100) - cont_flush(); - return true; } -- cgit v1.2.3 From d2130e82e9454304e9b91ba9da551b5989af8c27 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 10 Oct 2018 20:33:08 +0900 Subject: printk: fix integer overflow in setup_log_buf() The way we calculate logbuf free space percentage overflows signed integer: int free; free = __LOG_BUF_LEN - log_next_idx; pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); We support LOG_BUF_LEN of up to 1<<25 bytes. Since setup_log_buf() is called during early init, logbuf is mostly empty, so __LOG_BUF_LEN - log_next_idx is close to 1<<25. Thus when we multiply it by 100, we overflow signed integer value range: 100 is 2^6 + 2^5 + 2^2. Example, booting with LOG_BUF_LEN 1<<25 and log_buf_len=2G boot param: [ 0.075317] log_buf_len: -2147483648 bytes [ 0.075319] early log buf free: 33549896(-28%) Make "free" unsigned integer and use appropriate printk() specifier. Link: http://lkml.kernel.org/r/20181010113308.9337-1-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: linux-kernel@vger.kernel.org Cc: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 505663bb4f1f..b77150ad1965 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1098,7 +1098,7 @@ void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; - int free; + unsigned int free; if (log_buf != __log_buf) return; -- cgit v1.2.3 From 1ae80cf31938c8f77c37a29bbe29e7f1cd492be8 Mon Sep 17 00:00:00 2001 From: Daniel Colascione Date: Fri, 12 Oct 2018 03:54:27 -0700 Subject: bpf: wait for running BPF programs when updating map-in-map The map-in-map frequently serves as a mechanism for atomic snapshotting of state that a BPF program might record. The current implementation is dangerous to use in this way, however, since userspace has no way of knowing when all programs that might have retrieved the "old" value of the map may have completed. This change ensures that map update operations on map-in-map map types always wait for all references to the old map to drop before returning to userspace. Signed-off-by: Daniel Colascione Reviewed-by: Joel Fernandes (Google) Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4f416234251f..53968f82b919 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -748,6 +748,17 @@ err_put: return err; } +static void maybe_wait_bpf_programs(struct bpf_map *map) +{ + /* Wait for any running BPF programs to complete so that + * userspace, when we return to it, knows that all programs + * that could be running use the new map value. + */ + if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) + synchronize_rcu(); +} + #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) @@ -842,6 +853,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); + maybe_wait_bpf_programs(map); out: free_value: kfree(value); @@ -894,6 +906,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); + maybe_wait_bpf_programs(map); out: kfree(key); err_put: -- cgit v1.2.3 From 1243a51f6c05ecbb2c5c9e02fdcc1e7a06f76f26 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 13 Oct 2018 02:45:57 +0200 Subject: tcp, ulp: remove ulp bits from sockmap In order to prepare sockmap logic to be used in combination with kTLS we need to detangle it from ULP, and further split it in later commits into a generic API. Joint work with John. Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5d0677d808ae..de6f7a65c72b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -182,6 +182,7 @@ enum { static struct proto *saved_tcpv6_prot __read_mostly; static DEFINE_SPINLOCK(tcpv6_prot_lock); static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; + static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { @@ -239,6 +240,13 @@ static int bpf_tcp_init(struct sock *sk) return 0; } +static int __init bpf_sock_init(void) +{ + build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); + return 0; +} +core_initcall(bpf_sock_init); + static void smap_release_sock(struct smap_psock *psock, struct sock *sock); static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); @@ -413,15 +421,6 @@ enum __sk_action { __SK_NONE, }; -static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { - .name = "bpf_tcp", - .uid = TCP_ULP_BPF, - .user_visible = false, - .owner = NULL, - .init = bpf_tcp_init, - .release = bpf_tcp_release, -}; - static int memcopy_from_iter(struct sock *sk, struct sk_msg_buff *md, struct iov_iter *from, int bytes) @@ -1236,16 +1235,6 @@ static void bpf_tcp_msg_add(struct smap_psock *psock, bpf_prog_put(orig_tx_msg); } -static int bpf_tcp_ulp_register(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - /* Once BPF TX ULP is registered it is never unregistered. It - * will be in the ULP list for the lifetime of the system. Doing - * duplicate registers is not a problem. - */ - return tcp_register_ulp(&bpf_tcp_ulp_ops); -} - static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -1491,7 +1480,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { if (refcount_dec_and_test(&psock->refcnt)) { if (psock_is_smap_sk(sock)) - tcp_cleanup_ulp(sock); + bpf_tcp_release(sock); write_lock_bh(&sock->sk_callback_lock); smap_stop_sock(psock, sock); write_unlock_bh(&sock->sk_callback_lock); @@ -1666,10 +1655,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - stab = kzalloc(sizeof(*stab), GFP_USER); if (!stab) return ERR_PTR(-ENOMEM); @@ -1951,7 +1936,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, if (tx_msg) bpf_tcp_msg_add(psock, sock, tx_msg); if (new) { - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + err = bpf_tcp_init(sock); if (err) goto out_free; } @@ -2187,10 +2172,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) */ return ERR_PTR(-E2BIG); - err = bpf_tcp_ulp_register(); - if (err && err != -EEXIST) - return ERR_PTR(err); - htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 604326b41a6fb9b4a78b6179335decee0365cd8c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 13 Oct 2018 02:45:58 +0200 Subject: bpf, sockmap: convert to generic sk_msg interface Add a generic sk_msg layer, and convert current sockmap and later kTLS over to make use of it. While sk_buff handles network packet representation from netdevice up to socket, sk_msg handles data representation from application to socket layer. This means that sk_msg framework spans across ULP users in the kernel, and enables features such as introspection or filtering of data with the help of BPF programs that operate on this data structure. Latter becomes in particular useful for kTLS where data encryption is deferred into the kernel, and as such enabling the kernel to perform L7 introspection and policy based on BPF for TLS connections where the record is being encrypted after BPF has run and came to a verdict. In order to get there, first step is to transform open coding of scatter-gather list handling into a common core framework that subsystems can use. The code itself has been split and refactored into three bigger pieces: i) the generic sk_msg API which deals with managing the scatter gather ring, providing helpers for walking and mangling, transferring application data from user space into it, and preparing it for BPF pre/post-processing, ii) the plain sock map itself where sockets can be attached to or detached from; these bits are independent of i) which can now be used also without sock map, and iii) the integration with plain TCP as one protocol to be used for processing L7 application data (later this could e.g. also be extended to other protocols like UDP). The semantics are the same with the old sock map code and therefore no change of user facing behavior or APIs. While pursuing this work it also helped finding a number of bugs in the old sockmap code that we've fixed already in earlier commits. The test_sockmap kselftest suite passes through fine as well. Joint work with John. Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 5 - kernel/bpf/core.c | 2 - kernel/bpf/sockmap.c | 2610 -------------------------------------------------- kernel/bpf/syscall.c | 6 +- 4 files changed, 3 insertions(+), 2620 deletions(-) delete mode 100644 kernel/bpf/sockmap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0488b8258321..ff8262626b8f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y) obj-$(CONFIG_BPF_SYSCALL) += xskmap.o endif obj-$(CONFIG_BPF_SYSCALL) += offload.o -ifeq ($(CONFIG_STREAM_PARSER),y) -ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_BPF_SYSCALL) += sockmap.o -endif -endif endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3f5bf1af0826..defcf4df6d91 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; -const struct bpf_func_proto bpf_sock_map_update_proto __weak; -const struct bpf_func_proto bpf_sock_hash_update_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c deleted file mode 100644 index de6f7a65c72b..000000000000 --- a/kernel/bpf/sockmap.c +++ /dev/null @@ -1,2610 +0,0 @@ -/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -/* A BPF sock_map is used to store sock objects. This is primarly used - * for doing socket redirect with BPF helper routines. - * - * A sock map may have BPF programs attached to it, currently a program - * used to parse packets and a program to provide a verdict and redirect - * decision on the packet are supported. Any programs attached to a sock - * map are inherited by sock objects when they are added to the map. If - * no BPF programs are attached the sock object may only be used for sock - * redirect. - * - * A sock object may be in multiple maps, but can only inherit a single - * parse or verdict program. If adding a sock object to a map would result - * in having multiple parsing programs the update will return an EBUSY error. - * - * For reference this program is similar to devmap used in XDP context - * reviewing these together may be useful. For an example please review - * ./samples/bpf/sockmap/. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define SOCK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - -struct bpf_sock_progs { - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; -}; - -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; - struct bpf_sock_progs progs; - raw_spinlock_t lock; -}; - -struct bucket { - struct hlist_head head; - raw_spinlock_t lock; -}; - -struct bpf_htab { - struct bpf_map map; - struct bucket *buckets; - atomic_t count; - u32 n_buckets; - u32 elem_size; - struct bpf_sock_progs progs; - struct rcu_head rcu; -}; - -struct htab_elem { - struct rcu_head rcu; - struct hlist_node hash_node; - u32 hash; - struct sock *sk; - char key[0]; -}; - -enum smap_psock_state { - SMAP_TX_RUNNING, -}; - -struct smap_psock_map_entry { - struct list_head list; - struct bpf_map *map; - struct sock **entry; - struct htab_elem __rcu *hash_link; -}; - -struct smap_psock { - struct rcu_head rcu; - refcount_t refcnt; - - /* datapath variables */ - struct sk_buff_head rxqueue; - bool strp_enabled; - - /* datapath error path cache across tx work invocations */ - int save_rem; - int save_off; - struct sk_buff *save_skb; - - /* datapath variables for tx_msg ULP */ - struct sock *sk_redir; - int apply_bytes; - int cork_bytes; - int sg_size; - int eval; - struct sk_msg_buff *cork; - struct list_head ingress; - - struct strparser strp; - struct bpf_prog *bpf_tx_msg; - struct bpf_prog *bpf_parse; - struct bpf_prog *bpf_verdict; - struct list_head maps; - spinlock_t maps_lock; - - /* Back reference used when sock callback trigger sockmap operations */ - struct sock *sock; - unsigned long state; - - struct work_struct tx_work; - struct work_struct gc_work; - - struct proto *sk_proto; - void (*save_unhash)(struct sock *sk); - void (*save_close)(struct sock *sk, long timeout); - void (*save_data_ready)(struct sock *sk); - void (*save_write_space)(struct sock *sk); -}; - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len); -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags); -static void bpf_tcp_unhash(struct sock *sk); -static void bpf_tcp_close(struct sock *sk, long timeout); - -static inline struct smap_psock *smap_psock_sk(const struct sock *sk) -{ - return rcu_dereference_sk_user_data(sk); -} - -static bool bpf_tcp_stream_read(const struct sock *sk) -{ - struct smap_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - empty = list_empty(&psock->ingress); -out: - rcu_read_unlock(); - return !empty; -} - -enum { - SOCKMAP_IPV4, - SOCKMAP_IPV6, - SOCKMAP_NUM_PROTS, -}; - -enum { - SOCKMAP_BASE, - SOCKMAP_TX, - SOCKMAP_NUM_CONFIGS, -}; - -static struct proto *saved_tcpv6_prot __read_mostly; -static DEFINE_SPINLOCK(tcpv6_prot_lock); -static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS]; - -static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], - struct proto *base) -{ - prot[SOCKMAP_BASE] = *base; - prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; - prot[SOCKMAP_BASE].close = bpf_tcp_close; - prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; - prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; - - prot[SOCKMAP_TX] = prot[SOCKMAP_BASE]; - prot[SOCKMAP_TX].sendmsg = bpf_tcp_sendmsg; - prot[SOCKMAP_TX].sendpage = bpf_tcp_sendpage; -} - -static void update_sk_prot(struct sock *sk, struct smap_psock *psock) -{ - int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4; - int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE; - - sk->sk_prot = &bpf_tcp_prots[family][conf]; -} - -static int bpf_tcp_init(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return -EINVAL; - } - - if (unlikely(psock->sk_proto)) { - rcu_read_unlock(); - return -EBUSY; - } - - psock->save_unhash = sk->sk_prot->unhash; - psock->save_close = sk->sk_prot->close; - psock->sk_proto = sk->sk_prot; - - /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ - if (sk->sk_family == AF_INET6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { - spin_lock_bh(&tcpv6_prot_lock); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); - } - spin_unlock_bh(&tcpv6_prot_lock); - } - update_sk_prot(sk, psock); - rcu_read_unlock(); - return 0; -} - -static int __init bpf_sock_init(void) -{ - build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot); - return 0; -} -core_initcall(bpf_sock_init); - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge); - -static void bpf_tcp_release(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - if (psock->sk_proto) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; - } -out: - rcu_read_unlock(); -} - -static struct htab_elem *lookup_elem_raw(struct hlist_head *head, - u32 hash, void *key, u32 key_size) -{ - struct htab_elem *l; - - hlist_for_each_entry_rcu(l, head, hash_node) { - if (l->hash == hash && !memcmp(&l->key, key, key_size)) - return l; - } - - return NULL; -} - -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &htab->buckets[hash & (htab->n_buckets - 1)]; -} - -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) -{ - return &__select_bucket(htab, hash)->head; -} - -static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) -{ - atomic_dec(&htab->count); - kfree_rcu(l, rcu); -} - -static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, - struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - - spin_lock_bh(&psock->maps_lock); - e = list_first_entry_or_null(&psock->maps, - struct smap_psock_map_entry, - list); - if (e) - list_del(&e->list); - spin_unlock_bh(&psock->maps_lock); - return e; -} - -static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) -{ - struct smap_psock_map_entry *e; - struct sk_msg_buff *md, *mtmp; - struct sock *osk; - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - psock->cork = NULL; - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - e = psock_map_pop(sk, psock); - while (e) { - if (e->entry) { - struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map); - - raw_spin_lock_bh(&stab->lock); - osk = *e->entry; - if (osk == sk) { - *e->entry = NULL; - smap_release_sock(psock, sk); - } - raw_spin_unlock_bh(&stab->lock); - } else { - struct htab_elem *link = rcu_dereference(e->hash_link); - struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - struct bucket *b; - - b = __select_bucket(htab, link->hash); - head = &b->head; - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, - link->hash, link->key, - htab->map.key_size); - /* If another thread deleted this object skip deletion. - * The refcnt on psock may or may not be zero. - */ - if (l && l == link) { - hlist_del_rcu(&link->hash_node); - smap_release_sock(psock, link->sk); - free_htab_elem(htab, link); - } - raw_spin_unlock_bh(&b->lock); - } - kfree(e); - e = psock_map_pop(sk, psock); - } -} - -static void bpf_tcp_unhash(struct sock *sk) -{ - void (*unhash_fun)(struct sock *sk); - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - if (sk->sk_prot->unhash) - sk->sk_prot->unhash(sk); - return; - } - unhash_fun = psock->save_unhash; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - unhash_fun(sk); -} - -static void bpf_tcp_close(struct sock *sk, long timeout) -{ - void (*close_fun)(struct sock *sk, long timeout); - struct smap_psock *psock; - - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - close_fun = psock->save_close; - bpf_tcp_remove(sk, psock); - rcu_read_unlock(); - release_sock(sk); - close_fun(sk, timeout); -} - -enum __sk_action { - __SK_DROP = 0, - __SK_PASS, - __SK_REDIRECT, - __SK_NONE, -}; - -static int memcopy_from_iter(struct sock *sk, - struct sk_msg_buff *md, - struct iov_iter *from, int bytes) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_curr, rc = -ENOSPC; - - do { - int copy; - char *to; - - if (md->sg_copybreak >= sg[i].length) { - md->sg_copybreak = 0; - - if (++i == MAX_SKB_FRAGS) - i = 0; - - if (i == md->sg_end) - break; - } - - copy = sg[i].length - md->sg_copybreak; - to = sg_virt(&sg[i]) + md->sg_copybreak; - md->sg_copybreak += copy; - - if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) - rc = copy_from_iter_nocache(to, copy, from); - else - rc = copy_from_iter(to, copy, from); - - if (rc != copy) { - rc = -EFAULT; - goto out; - } - - bytes -= copy; - if (!bytes) - break; - - md->sg_copybreak = 0; - if (++i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -out: - md->sg_curr = i; - return rc; -} - -static int bpf_tcp_push(struct sock *sk, int apply_bytes, - struct sk_msg_buff *md, - int flags, bool uncharge) -{ - bool apply = apply_bytes; - struct scatterlist *sg; - int offset, ret = 0; - struct page *p; - size_t size; - - while (1) { - sg = md->sg_data + md->sg_start; - size = (apply && apply_bytes < sg->length) ? - apply_bytes : sg->length; - offset = sg->offset; - - tcp_rate_check_app_limited(sk); - p = sg_page(sg); -retry: - ret = do_tcp_sendpages(sk, p, offset, size, flags); - if (ret != size) { - if (ret > 0) { - if (apply) - apply_bytes -= ret; - - sg->offset += ret; - sg->length -= ret; - size -= ret; - offset += ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - goto retry; - } - - return ret; - } - - if (apply) - apply_bytes -= ret; - sg->offset += ret; - sg->length -= ret; - if (uncharge) - sk_mem_uncharge(sk, ret); - - if (!sg->length) { - put_page(p); - md->sg_start++; - if (md->sg_start == MAX_SKB_FRAGS) - md->sg_start = 0; - sg_init_table(sg, 1); - - if (md->sg_start == md->sg_end) - break; - } - - if (apply && !apply_bytes) - break; - } - return 0; -} - -static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data + md->sg_start; - - if (md->sg_copy[md->sg_start]) { - md->data = md->data_end = 0; - } else { - md->data = sg_virt(sg); - md->data_end = md->data + sg->length; - } -} - -static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start; - - do { - int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; - - sk_mem_uncharge(sk, uncharge); - bytes -= uncharge; - if (!bytes) - break; - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - } while (i != md->sg_end); -} - -static void free_bytes_sg(struct sock *sk, int bytes, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = md->sg_start, free; - - while (bytes && sg[i].length) { - free = sg[i].length; - if (bytes < free) { - sg[i].length -= bytes; - sg[i].offset += bytes; - if (charge) - sk_mem_uncharge(sk, bytes); - break; - } - - if (charge) - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - bytes -= sg[i].length; - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - md->sg_start = i; -} - -static int free_sg(struct sock *sk, int start, - struct sk_msg_buff *md, bool charge) -{ - struct scatterlist *sg = md->sg_data; - int i = start, free = 0; - - while (sg[i].length) { - free += sg[i].length; - if (charge) - sk_mem_uncharge(sk, sg[i].length); - if (!md->skb) - put_page(sg_page(&sg[i])); - sg[i].length = 0; - sg[i].page_link = 0; - sg[i].offset = 0; - i++; - - if (i == MAX_SKB_FRAGS) - i = 0; - } - consume_skb(md->skb); - - return free; -} - -static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge) -{ - int free = free_sg(sk, md->sg_start, md, charge); - - md->sg_start = md->sg_end; - return free; -} - -static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) -{ - return free_sg(sk, md->sg_curr, md, true); -} - -static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) -{ - return ((_rc == SK_PASS) ? - (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP); -} - -static unsigned int smap_do_tx_msg(struct sock *sk, - struct smap_psock *psock, - struct sk_msg_buff *md) -{ - struct bpf_prog *prog; - unsigned int rc, _rc; - - preempt_disable(); - rcu_read_lock(); - - /* If the policy was removed mid-send then default to 'accept' */ - prog = READ_ONCE(psock->bpf_tx_msg); - if (unlikely(!prog)) { - _rc = SK_PASS; - goto verdict; - } - - bpf_compute_data_pointers_sg(md); - md->sk = sk; - rc = (*prog->bpf_func)(md, prog->insnsi); - psock->apply_bytes = md->apply_bytes; - - /* Moving return codes from UAPI namespace into internal namespace */ - _rc = bpf_map_msg_verdict(rc, md); - - /* The psock has a refcount on the sock but not on the map and because - * we need to drop rcu read lock here its possible the map could be - * removed between here and when we need it to execute the sock - * redirect. So do the map lookup now for future use. - */ - if (_rc == __SK_REDIRECT) { - if (psock->sk_redir) - sock_put(psock->sk_redir); - psock->sk_redir = do_msg_redirect_map(md); - if (!psock->sk_redir) { - _rc = __SK_DROP; - goto verdict; - } - sock_hold(psock->sk_redir); - } -verdict: - rcu_read_unlock(); - preempt_enable(); - - return _rc; -} - -static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, - struct smap_psock *psock, - struct sk_msg_buff *md, int flags) -{ - bool apply = apply_bytes; - size_t size, copied = 0; - struct sk_msg_buff *r; - int err = 0, i; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); - if (unlikely(!r)) - return -ENOMEM; - - lock_sock(sk); - r->sg_start = md->sg_start; - i = md->sg_start; - - do { - size = (apply && apply_bytes < md->sg_data[i].length) ? - apply_bytes : md->sg_data[i].length; - - if (!sk_wmem_schedule(sk, size)) { - if (!copied) - err = -ENOMEM; - break; - } - - sk_mem_charge(sk, size); - r->sg_data[i] = md->sg_data[i]; - r->sg_data[i].length = size; - md->sg_data[i].length -= size; - md->sg_data[i].offset += size; - copied += size; - - if (md->sg_data[i].length) { - get_page(sg_page(&r->sg_data[i])); - r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; - } else { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - r->sg_end = i; - } - - if (apply) { - apply_bytes -= size; - if (!apply_bytes) - break; - } - } while (i != md->sg_end); - - md->sg_start = i; - - if (!err) { - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - } else { - free_start_sg(sk, r, true); - kfree(r); - } - - release_sock(sk); - return err; -} - -static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, - struct sk_msg_buff *md, - int flags) -{ - bool ingress = !!(md->flags & BPF_F_INGRESS); - struct smap_psock *psock; - int err = 0; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out_rcu; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto out_rcu; - - rcu_read_unlock(); - - if (ingress) { - err = bpf_tcp_ingress(sk, send, psock, md, flags); - } else { - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); - } - smap_release_sock(psock, sk); - return err; -out_rcu: - rcu_read_unlock(); - return 0; -} - -static inline void bpf_md_init(struct smap_psock *psock) -{ - if (!psock->apply_bytes) { - psock->eval = __SK_NONE; - if (psock->sk_redir) { - sock_put(psock->sk_redir); - psock->sk_redir = NULL; - } - } -} - -static void apply_bytes_dec(struct smap_psock *psock, int i) -{ - if (psock->apply_bytes) { - if (psock->apply_bytes < i) - psock->apply_bytes = 0; - else - psock->apply_bytes -= i; - } -} - -static int bpf_exec_tx_verdict(struct smap_psock *psock, - struct sk_msg_buff *m, - struct sock *sk, - int *copied, int flags) -{ - bool cork = false, enospc = (m->sg_start == m->sg_end); - struct sock *redir; - int err = 0; - int send; - -more_data: - if (psock->eval == __SK_NONE) - psock->eval = smap_do_tx_msg(sk, psock, m); - - if (m->cork_bytes && - m->cork_bytes > psock->sg_size && !enospc) { - psock->cork_bytes = m->cork_bytes - psock->sg_size; - if (!psock->cork) { - psock->cork = kcalloc(1, - sizeof(struct sk_msg_buff), - GFP_ATOMIC | __GFP_NOWARN); - - if (!psock->cork) { - err = -ENOMEM; - goto out_err; - } - } - memcpy(psock->cork, m, sizeof(*m)); - goto out_err; - } - - send = psock->sg_size; - if (psock->apply_bytes && psock->apply_bytes < send) - send = psock->apply_bytes; - - switch (psock->eval) { - case __SK_PASS: - err = bpf_tcp_push(sk, send, m, flags, true); - if (unlikely(err)) { - *copied -= free_start_sg(sk, m, true); - break; - } - - apply_bytes_dec(psock, send); - psock->sg_size -= send; - break; - case __SK_REDIRECT: - redir = psock->sk_redir; - apply_bytes_dec(psock, send); - - if (psock->cork) { - cork = true; - psock->cork = NULL; - } - - return_mem_sg(sk, send, m); - release_sock(sk); - - err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); - lock_sock(sk); - - if (unlikely(err < 0)) { - int free = free_start_sg(sk, m, false); - - psock->sg_size = 0; - if (!cork) - *copied -= free; - } else { - psock->sg_size -= send; - } - - if (cork) { - free_start_sg(sk, m, true); - psock->sg_size = 0; - kfree(m); - m = NULL; - err = 0; - } - break; - case __SK_DROP: - default: - free_bytes_sg(sk, send, m, true); - apply_bytes_dec(psock, send); - *copied -= send; - psock->sg_size -= send; - err = -EACCES; - break; - } - - if (likely(!err)) { - bpf_md_init(psock); - if (m && - m->sg_data[m->sg_start].page_link && - m->sg_data[m->sg_start].length) - goto more_data; - } - -out_err: - return err; -} - -static int bpf_wait_data(struct sock *sk, - struct smap_psock *psk, int flags, - long timeo, int *err) -{ - int rc; - - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - rc = sk_wait_event(sk, &timeo, - !list_empty(&psk->ingress) || - !skb_queue_empty(&sk->sk_receive_queue), - &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - - return rc; -} - -static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) -{ - struct iov_iter *iter = &msg->msg_iter; - struct smap_psock *psock; - int copied = 0; - - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); - if (!skb_queue_empty(&sk->sk_receive_queue)) - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto out; - - if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) - goto out; - rcu_read_unlock(); - - lock_sock(sk); -bytes_ready: - while (copied != len) { - struct scatterlist *sg; - struct sk_msg_buff *md; - int i; - - md = list_first_entry_or_null(&psock->ingress, - struct sk_msg_buff, list); - if (unlikely(!md)) - break; - i = md->sg_start; - do { - struct page *page; - int n, copy; - - sg = &md->sg_data[i]; - copy = sg->length; - page = sg_page(sg); - - if (copied + copy > len) - copy = len - copied; - - n = copy_page_to_iter(page, sg->offset, copy, iter); - if (n != copy) { - md->sg_start = i; - release_sock(sk); - smap_release_sock(psock, sk); - return -EFAULT; - } - - copied += copy; - sg->offset += copy; - sg->length -= copy; - sk_mem_uncharge(sk, copy); - - if (!sg->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!md->skb) - put_page(page); - } - if (copied == len) - break; - } while (i != md->sg_end); - md->sg_start = i; - - if (!sg->length && md->sg_start == md->sg_end) { - list_del(&md->list); - consume_skb(md->skb); - kfree(md); - } - } - - if (!copied) { - long timeo; - int data; - int err = 0; - - timeo = sock_rcvtimeo(sk, nonblock); - data = bpf_wait_data(sk, psock, flags, timeo, &err); - - if (data) { - if (!skb_queue_empty(&sk->sk_receive_queue)) { - release_sock(sk); - smap_release_sock(psock, sk); - copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - return copied; - } - goto bytes_ready; - } - - if (err) - copied = err; - } - - release_sock(sk); - smap_release_sock(psock, sk); - return copied; -out: - rcu_read_unlock(); - return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); -} - - -static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) -{ - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; - struct sk_msg_buff md = {0}; - unsigned int sg_copy = 0; - struct smap_psock *psock; - int copied = 0, err = 0; - struct scatterlist *sg; - long timeo; - - /* Its possible a sock event or user removed the psock _but_ the ops - * have not been reprogrammed yet so we get here. In this case fallback - * to tcp_sendmsg. Note this only works because we _only_ ever allow - * a single ULP there is no hierarchy here. - */ - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - /* Increment the psock refcnt to ensure its not released while sending a - * message. Required because sk lookup and bpf programs are used in - * separate rcu critical sections. Its OK if we lose the map entry - * but we can't lose the sock reference. - */ - if (!refcount_inc_not_zero(&psock->refcnt)) { - rcu_read_unlock(); - return tcp_sendmsg(sk, msg, size); - } - - sg = md.sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - rcu_read_unlock(); - - lock_sock(sk); - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - while (msg_data_left(msg)) { - struct sk_msg_buff *m = NULL; - bool enospc = false; - int copy; - - if (sk->sk_err) { - err = -sk->sk_err; - goto out_err; - } - - copy = msg_data_left(msg); - if (!sk_stream_memory_free(sk)) - goto wait_for_sndbuf; - - m = psock->cork_bytes ? psock->cork : &md; - m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; - err = sk_alloc_sg(sk, copy, m->sg_data, - m->sg_start, &m->sg_end, &sg_copy, - m->sg_end - 1); - if (err) { - if (err != -ENOSPC) - goto wait_for_memory; - enospc = true; - copy = sg_copy; - } - - err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); - if (err < 0) { - free_curr_sg(sk, m); - goto out_err; - } - - psock->sg_size += copy; - copied += copy; - sg_copy = 0; - - /* When bytes are being corked skip running BPF program and - * applying verdict unless there is no more buffer space. In - * the ENOSPC case simply run BPF prorgram with currently - * accumulated data. We don't have much choice at this point - * we could try extending the page frags or chaining complex - * frags but even in these cases _eventually_ we will hit an - * OOM scenario. More complex recovery schemes may be - * implemented in the future, but BPF programs must handle - * the case where apply_cork requests are not honored. The - * canonical method to verify this is to check data length. - */ - if (psock->cork_bytes) { - if (copy > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= copy; - - if (psock->cork_bytes && !enospc) - goto out_cork; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); - if (unlikely(err < 0)) - goto out_err; - continue; -wait_for_sndbuf: - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -wait_for_memory: - err = sk_stream_wait_memory(sk, &timeo); - if (err) { - if (m && m != psock->cork) - free_start_sg(sk, m, true); - goto out_err; - } - } -out_err: - if (err < 0) - err = sk_stream_error(sk, msg->msg_flags, err); -out_cork: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -} - -static int bpf_tcp_sendpage(struct sock *sk, struct page *page, - int offset, size_t size, int flags) -{ - struct sk_msg_buff md = {0}, *m = NULL; - int err = 0, copied = 0; - struct smap_psock *psock; - struct scatterlist *sg; - bool enospc = false; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - goto accept; - - if (!refcount_inc_not_zero(&psock->refcnt)) - goto accept; - rcu_read_unlock(); - - lock_sock(sk); - - if (psock->cork_bytes) { - m = psock->cork; - sg = &m->sg_data[m->sg_end]; - } else { - m = &md; - sg = m->sg_data; - sg_init_marker(sg, MAX_SKB_FRAGS); - } - - /* Catch case where ring is full and sendpage is stalled. */ - if (unlikely(m->sg_end == m->sg_start && - m->sg_data[m->sg_end].length)) - goto out_err; - - psock->sg_size += size; - sg_set_page(sg, page, size, offset); - get_page(page); - m->sg_copy[m->sg_end] = true; - sk_mem_charge(sk, size); - m->sg_end++; - copied = size; - - if (m->sg_end == MAX_SKB_FRAGS) - m->sg_end = 0; - - if (m->sg_end == m->sg_start) - enospc = true; - - if (psock->cork_bytes) { - if (size > psock->cork_bytes) - psock->cork_bytes = 0; - else - psock->cork_bytes -= size; - - if (psock->cork_bytes && !enospc) - goto out_err; - - /* All cork bytes accounted for re-run filter */ - psock->eval = __SK_NONE; - psock->cork_bytes = 0; - } - - err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); -out_err: - release_sock(sk); - smap_release_sock(psock, sk); - return copied ? copied : err; -accept: - rcu_read_unlock(); - return tcp_sendpage(sk, page, offset, size, flags); -} - -static void bpf_tcp_msg_add(struct smap_psock *psock, - struct sock *sk, - struct bpf_prog *tx_msg) -{ - struct bpf_prog *orig_tx_msg; - - orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); - if (orig_tx_msg) - bpf_prog_put(orig_tx_msg); -} - -static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) -{ - struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); - int rc; - - if (unlikely(!prog)) - return __SK_DROP; - - skb_orphan(skb); - /* We need to ensure that BPF metadata for maps is also cleared - * when we orphan the skb so that we don't have the possibility - * to reference a stale map. - */ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - preempt_disable(); - rc = (*prog->bpf_func)(skb, prog->insnsi); - preempt_enable(); - skb->sk = NULL; - - /* Moving return codes from UAPI namespace into internal namespace */ - return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : - __SK_DROP; -} - -static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) -{ - struct sock *sk = psock->sock; - int copied = 0, num_sg; - struct sk_msg_buff *r; - - r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); - if (unlikely(!r)) - return -EAGAIN; - - if (!sk_rmem_schedule(sk, skb, skb->len)) { - kfree(r); - return -EAGAIN; - } - - sg_init_table(r->sg_data, MAX_SKB_FRAGS); - num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); - if (unlikely(num_sg < 0)) { - kfree(r); - return num_sg; - } - sk_mem_charge(sk, skb->len); - copied = skb->len; - r->sg_start = 0; - r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; - r->skb = skb; - list_add_tail(&r->list, &psock->ingress); - sk->sk_data_ready(sk); - return copied; -} - -static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) -{ - struct smap_psock *peer; - struct sock *sk; - __u32 in; - int rc; - - rc = smap_verdict_func(psock, skb); - switch (rc) { - case __SK_REDIRECT: - sk = do_sk_redirect_map(skb); - if (!sk) { - kfree_skb(skb); - break; - } - - peer = smap_psock_sk(sk); - in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - - if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || - !test_bit(SMAP_TX_RUNNING, &peer->state))) { - kfree_skb(skb); - break; - } - - if (!in && sock_writeable(sk)) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } else if (in && - atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } - /* Fall through and free skb otherwise */ - case __SK_DROP: - default: - kfree_skb(skb); - } -} - -static void smap_report_sk_error(struct smap_psock *psock, int err) -{ - struct sock *sk = psock->sock; - - sk->sk_err = err; - sk->sk_error_report(sk); -} - -static void smap_read_sock_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - smap_do_verdict(psock, skb); - rcu_read_unlock(); -} - -/* Called with lock held on socket */ -static void smap_data_ready(struct sock *sk) -{ - struct smap_psock *psock; - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); - } - rcu_read_unlock(); -} - -static void smap_tx_work(struct work_struct *w) -{ - struct smap_psock *psock; - struct sk_buff *skb; - int rem, off, n; - - psock = container_of(w, struct smap_psock, tx_work); - - /* lock sock to avoid losing sk_socket at some point during loop */ - lock_sock(psock->sock); - if (psock->save_skb) { - skb = psock->save_skb; - rem = psock->save_rem; - off = psock->save_off; - psock->save_skb = NULL; - goto start; - } - - while ((skb = skb_dequeue(&psock->rxqueue))) { - __u32 flags; - - rem = skb->len; - off = 0; -start: - flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; - do { - if (likely(psock->sock->sk_socket)) { - if (flags) - n = smap_do_ingress(psock, skb); - else - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - } else { - n = -EINVAL; - } - - if (n <= 0) { - if (n == -EAGAIN) { - /* Retry when space is available */ - psock->save_skb = skb; - psock->save_rem = rem; - psock->save_off = off; - goto out; - } - /* Hard errors break pipe and stop xmit */ - smap_report_sk_error(psock, n ? -n : EPIPE); - clear_bit(SMAP_TX_RUNNING, &psock->state); - kfree_skb(skb); - goto out; - } - rem -= n; - off += n; - } while (rem); - - if (!flags) - kfree_skb(skb); - } -out: - release_sock(psock->sock); -} - -static void smap_write_space(struct sock *sk) -{ - struct smap_psock *psock; - void (*write_space)(struct sock *sk); - - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) - schedule_work(&psock->tx_work); - write_space = psock->save_write_space; - rcu_read_unlock(); - write_space(sk); -} - -static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) -{ - if (!psock->strp_enabled) - return; - sk->sk_data_ready = psock->save_data_ready; - sk->sk_write_space = psock->save_write_space; - psock->save_data_ready = NULL; - psock->save_write_space = NULL; - strp_stop(&psock->strp); - psock->strp_enabled = false; -} - -static void smap_destroy_psock(struct rcu_head *rcu) -{ - struct smap_psock *psock = container_of(rcu, - struct smap_psock, rcu); - - /* Now that a grace period has passed there is no longer - * any reference to this sock in the sockmap so we can - * destroy the psock, strparser, and bpf programs. But, - * because we use workqueue sync operations we can not - * do it in rcu context - */ - schedule_work(&psock->gc_work); -} - -static bool psock_is_smap_sk(struct sock *sk) -{ - return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops; -} - -static void smap_release_sock(struct smap_psock *psock, struct sock *sock) -{ - if (refcount_dec_and_test(&psock->refcnt)) { - if (psock_is_smap_sk(sock)) - bpf_tcp_release(sock); - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - clear_bit(SMAP_TX_RUNNING, &psock->state); - rcu_assign_sk_user_data(sock, NULL); - call_rcu_sched(&psock->rcu, smap_destroy_psock); - } -} - -static int smap_parse_func_strparser(struct strparser *strp, - struct sk_buff *skb) -{ - struct smap_psock *psock; - struct bpf_prog *prog; - int rc; - - rcu_read_lock(); - psock = container_of(strp, struct smap_psock, strp); - prog = READ_ONCE(psock->bpf_parse); - - if (unlikely(!prog)) { - rcu_read_unlock(); - return skb->len; - } - - /* Attach socket for bpf program to use if needed we can do this - * because strparser clones the skb before handing it to a upper - * layer, meaning skb_orphan has been called. We NULL sk on the - * way out to ensure we don't trigger a BUG_ON in skb/sk operations - * later and because we are not charging the memory of this skb to - * any socket yet. - */ - skb->sk = psock->sock; - bpf_compute_data_end_sk_skb(skb); - rc = (*prog->bpf_func)(skb, prog->insnsi); - skb->sk = NULL; - rcu_read_unlock(); - return rc; -} - -static int smap_read_sock_done(struct strparser *strp, int err) -{ - return err; -} - -static int smap_init_sock(struct smap_psock *psock, - struct sock *sk) -{ - static const struct strp_callbacks cb = { - .rcv_msg = smap_read_sock_strparser, - .parse_msg = smap_parse_func_strparser, - .read_sock_done = smap_read_sock_done, - }; - - return strp_init(&psock->strp, sk, &cb); -} - -static void smap_init_progs(struct smap_psock *psock, - struct bpf_prog *verdict, - struct bpf_prog *parse) -{ - struct bpf_prog *orig_parse, *orig_verdict; - - orig_parse = xchg(&psock->bpf_parse, parse); - orig_verdict = xchg(&psock->bpf_verdict, verdict); - - if (orig_verdict) - bpf_prog_put(orig_verdict); - if (orig_parse) - bpf_prog_put(orig_parse); -} - -static void smap_start_sock(struct smap_psock *psock, struct sock *sk) -{ - if (sk->sk_data_ready == smap_data_ready) - return; - psock->save_data_ready = sk->sk_data_ready; - psock->save_write_space = sk->sk_write_space; - sk->sk_data_ready = smap_data_ready; - sk->sk_write_space = smap_write_space; - psock->strp_enabled = true; -} - -static void sock_map_remove_complete(struct bpf_stab *stab) -{ - bpf_map_area_free(stab->sock_map); - kfree(stab); -} - -static void smap_gc_work(struct work_struct *w) -{ - struct smap_psock_map_entry *e, *tmp; - struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; - - psock = container_of(w, struct smap_psock, gc_work); - - /* no callback lock needed because we already detached sockmap ops */ - if (psock->strp_enabled) - strp_done(&psock->strp); - - cancel_work_sync(&psock->tx_work); - __skb_queue_purge(&psock->rxqueue); - - /* At this point all strparser and xmit work must be complete */ - if (psock->bpf_parse) - bpf_prog_put(psock->bpf_parse); - if (psock->bpf_verdict) - bpf_prog_put(psock->bpf_verdict); - if (psock->bpf_tx_msg) - bpf_prog_put(psock->bpf_tx_msg); - - if (psock->cork) { - free_start_sg(psock->sock, psock->cork, true); - kfree(psock->cork); - } - - list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { - list_del(&md->list); - free_start_sg(psock->sock, md, true); - kfree(md); - } - - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - list_del(&e->list); - kfree(e); - } - - if (psock->sk_redir) - sock_put(psock->sk_redir); - - sock_put(psock->sock); - kfree(psock); -} - -static struct smap_psock *smap_init_psock(struct sock *sock, int node) -{ - struct smap_psock *psock; - - psock = kzalloc_node(sizeof(struct smap_psock), - GFP_ATOMIC | __GFP_NOWARN, - node); - if (!psock) - return ERR_PTR(-ENOMEM); - - psock->eval = __SK_NONE; - psock->sock = sock; - skb_queue_head_init(&psock->rxqueue); - INIT_WORK(&psock->tx_work, smap_tx_work); - INIT_WORK(&psock->gc_work, smap_gc_work); - INIT_LIST_HEAD(&psock->maps); - INIT_LIST_HEAD(&psock->ingress); - refcount_set(&psock->refcnt, 1); - spin_lock_init(&psock->maps_lock); - - rcu_assign_sk_user_data(sock, psock); - sock_hold(sock); - return psock; -} - -static struct bpf_map *sock_map_alloc(union bpf_attr *attr) -{ - struct bpf_stab *stab; - u64 cost; - int err; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - stab = kzalloc(sizeof(*stab), GFP_USER); - if (!stab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&stab->map, attr); - raw_spin_lock_init(&stab->lock); - - /* make sure page count doesn't overflow */ - cost = (u64) stab->map.max_entries * sizeof(struct sock *); - err = -EINVAL; - if (cost >= U32_MAX - PAGE_SIZE) - goto free_stab; - - stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(stab->map.pages); - if (err) - goto free_stab; - - err = -ENOMEM; - stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *), - stab->map.numa_node); - if (!stab->sock_map) - goto free_stab; - - return &stab->map; -free_stab: - kfree(stab); - return ERR_PTR(err); -} - -static void smap_list_map_remove(struct smap_psock *psock, - struct sock **entry) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void smap_list_hash_remove(struct smap_psock *psock, - struct htab_elem *hash_link) -{ - struct smap_psock_map_entry *e, *tmp; - - spin_lock_bh(&psock->maps_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - struct htab_elem *c = rcu_dereference(e->hash_link); - - if (c == hash_link) { - list_del(&e->list); - kfree(e); - } - } - spin_unlock_bh(&psock->maps_lock); -} - -static void sock_map_free(struct bpf_map *map) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - raw_spin_lock_bh(&stab->lock); - for (i = 0; i < stab->map.max_entries; i++) { - struct smap_psock *psock; - struct sock *sock; - - sock = stab->sock_map[i]; - if (!sock) - continue; - stab->sock_map[i] = NULL; - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); - } - } - raw_spin_unlock_bh(&stab->lock); - rcu_read_unlock(); - - sock_map_remove_complete(stab); -} - -static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - u32 i = key ? *(u32 *)key : U32_MAX; - u32 *next = (u32 *)next_key; - - if (i >= stab->map.max_entries) { - *next = 0; - return 0; - } - - if (i == stab->map.max_entries - 1) - return -ENOENT; - - *next = i + 1; - return 0; -} - -struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - if (key >= map->max_entries) - return NULL; - - return READ_ONCE(stab->sock_map[key]); -} - -static int sock_map_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock *psock; - int k = *(u32 *)key; - struct sock *sock; - - if (k >= map->max_entries) - return -EINVAL; - - raw_spin_lock_bh(&stab->lock); - sock = stab->sock_map[k]; - stab->sock_map[k] = NULL; - raw_spin_unlock_bh(&stab->lock); - if (!sock) - return -EINVAL; - - psock = smap_psock_sk(sock); - if (!psock) - return 0; - if (psock->bpf_parse) { - write_lock_bh(&sock->sk_callback_lock); - smap_stop_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - smap_list_map_remove(psock, &stab->sock_map[k]); - smap_release_sock(psock, sock); - return 0; -} - -/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are - * done inside rcu critical sections. This ensures on updates that the psock - * will not be released via smap_release_sock() until concurrent updates/deletes - * complete. All operations operate on sock_map using cmpxchg and xchg - * operations to ensure we do not get stale references. Any reads into the - * map must be done with READ_ONCE() because of this. - * - * A psock is destroyed via call_rcu and after any worker threads are cancelled - * and syncd so we are certain all references from the update/lookup/delete - * operations as well as references in the data path are no longer in use. - * - * Psocks may exist in multiple maps, but only a single set of parse/verdict - * programs may be inherited from the maps it belongs to. A reference count - * is kept with the total number of references to the psock from all maps. The - * psock will not be released until this reaches zero. The psock and sock - * user data data use the sk_callback_lock to protect critical data structures - * from concurrent access. This allows us to avoid two updates from modifying - * the user data in sock and the lock is required anyways for modifying - * callbacks, we simply increase its scope slightly. - * - * Rules to follow, - * - psock must always be read inside RCU critical section - * - sk_user_data must only be modified inside sk_callback_lock and read - * inside RCU critical section. - * - psock->maps list must only be read & modified inside sk_callback_lock - * - sock_map must use READ_ONCE and (cmp)xchg operations - * - BPF verdict/parse programs must use READ_ONCE and xchg operations - */ - -static int __sock_map_ctx_update_elem(struct bpf_map *map, - struct bpf_sock_progs *progs, - struct sock *sock, - void *key) -{ - struct bpf_prog *verdict, *parse, *tx_msg; - struct smap_psock *psock; - bool new = false; - int err = 0; - - /* 1. If sock map has BPF programs those will be inherited by the - * sock being added. If the sock is already attached to BPF programs - * this results in an error. - */ - verdict = READ_ONCE(progs->bpf_verdict); - parse = READ_ONCE(progs->bpf_parse); - tx_msg = READ_ONCE(progs->bpf_tx_msg); - - if (parse && verdict) { - /* bpf prog refcnt may be zero if a concurrent attach operation - * removes the program after the above READ_ONCE() but before - * we increment the refcnt. If this is the case abort with an - * error. - */ - verdict = bpf_prog_inc_not_zero(verdict); - if (IS_ERR(verdict)) - return PTR_ERR(verdict); - - parse = bpf_prog_inc_not_zero(parse); - if (IS_ERR(parse)) { - bpf_prog_put(verdict); - return PTR_ERR(parse); - } - } - - if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(tx_msg); - if (IS_ERR(tx_msg)) { - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - return PTR_ERR(tx_msg); - } - } - - psock = smap_psock_sk(sock); - - /* 2. Do not allow inheriting programs if psock exists and has - * already inherited programs. This would create confusion on - * which parser/verdict program is running. If no psock exists - * create one. Inside sk_callback_lock to ensure concurrent create - * doesn't update user data. - */ - if (psock) { - if (!psock_is_smap_sk(sock)) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_parse) && parse) { - err = -EBUSY; - goto out_progs; - } - if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { - err = -EBUSY; - goto out_progs; - } - if (!refcount_inc_not_zero(&psock->refcnt)) { - err = -EAGAIN; - goto out_progs; - } - } else { - psock = smap_init_psock(sock, map->numa_node); - if (IS_ERR(psock)) { - err = PTR_ERR(psock); - goto out_progs; - } - - set_bit(SMAP_TX_RUNNING, &psock->state); - new = true; - } - - /* 3. At this point we have a reference to a valid psock that is - * running. Attach any BPF programs needed. - */ - if (tx_msg) - bpf_tcp_msg_add(psock, sock, tx_msg); - if (new) { - err = bpf_tcp_init(sock); - if (err) - goto out_free; - } - - if (parse && verdict && !psock->strp_enabled) { - err = smap_init_sock(psock, sock); - if (err) - goto out_free; - smap_init_progs(psock, verdict, parse); - write_lock_bh(&sock->sk_callback_lock); - smap_start_sock(psock, sock); - write_unlock_bh(&sock->sk_callback_lock); - } - - return err; -out_free: - smap_release_sock(psock, sock); -out_progs: - if (parse && verdict) { - bpf_prog_put(parse); - bpf_prog_put(verdict); - } - if (tx_msg) - bpf_prog_put(tx_msg); - return err; -} - -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) -{ - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_sock_progs *progs = &stab->progs; - struct sock *osock, *sock = skops->sk; - struct smap_psock_map_entry *e; - struct smap_psock *psock; - u32 i = *(u32 *)key; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto out; - - /* psock guaranteed to be present. */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&stab->lock); - osock = stab->sock_map[i]; - if (osock && flags == BPF_NOEXIST) { - err = -EEXIST; - goto out_unlock; - } - if (!osock && flags == BPF_EXIST) { - err = -ENOENT; - goto out_unlock; - } - - e->entry = &stab->sock_map[i]; - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - stab->sock_map[i] = sock; - if (osock) { - psock = smap_psock_sk(osock); - smap_list_map_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, osock); - } - raw_spin_unlock_bh(&stab->lock); - return 0; -out_unlock: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&stab->lock); -out: - kfree(e); - return err; -} - -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } else { - return -EINVAL; - } - - switch (type) { - case BPF_SK_MSG_VERDICT: - orig = xchg(&progs->bpf_tx_msg, prog); - break; - case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&progs->bpf_parse, prog); - break; - case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&progs->bpf_verdict, prog); - break; - default: - return -EOPNOTSUPP; - } - - if (orig) - bpf_prog_put(orig); - - return 0; -} - -int sockmap_get_from_fd(const union bpf_attr *attr, int type, - struct bpf_prog *prog) -{ - int ufd = attr->target_fd; - struct bpf_map *map; - struct fd f; - int err; - - f = fdget(ufd); - map = __bpf_map_get(f); - if (IS_ERR(map)) - return PTR_ERR(map); - - err = sock_map_prog(map, prog, attr->attach_type); - fdput(f); - return err; -} - -static void *sock_map_lookup(struct bpf_map *map, void *key) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static int sock_map_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_map_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static void sock_map_release(struct bpf_map *map) -{ - struct bpf_sock_progs *progs; - struct bpf_prog *orig; - - if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - - progs = &stab->progs; - } else { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - - progs = &htab->progs; - } - - orig = xchg(&progs->bpf_parse, NULL); - if (orig) - bpf_prog_put(orig); - orig = xchg(&progs->bpf_verdict, NULL); - if (orig) - bpf_prog_put(orig); - - orig = xchg(&progs->bpf_tx_msg, NULL); - if (orig) - bpf_prog_put(orig); -} - -static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) -{ - struct bpf_htab *htab; - int i, err; - u64 cost; - - if (!capable(CAP_NET_ADMIN)) - return ERR_PTR(-EPERM); - - /* check sanity of attributes */ - if (attr->max_entries == 0 || - attr->key_size == 0 || - attr->value_size != 4 || - attr->map_flags & ~SOCK_CREATE_FLAG_MASK) - return ERR_PTR(-EINVAL); - - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return ERR_PTR(-E2BIG); - - htab = kzalloc(sizeof(*htab), GFP_USER); - if (!htab) - return ERR_PTR(-ENOMEM); - - bpf_map_init_from_attr(&htab->map, attr); - - htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8); - err = -EINVAL; - if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct bucket)) - goto free_htab; - - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (cost >= U32_MAX - PAGE_SIZE) - goto free_htab; - - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - err = bpf_map_precharge_memlock(htab->map.pages); - if (err) - goto free_htab; - - err = -ENOMEM; - htab->buckets = bpf_map_area_alloc( - htab->n_buckets * sizeof(struct bucket), - htab->map.numa_node); - if (!htab->buckets) - goto free_htab; - - for (i = 0; i < htab->n_buckets; i++) { - INIT_HLIST_HEAD(&htab->buckets[i].head); - raw_spin_lock_init(&htab->buckets[i].lock); - } - - return &htab->map; -free_htab: - kfree(htab); - return ERR_PTR(err); -} - -static void __bpf_htab_free(struct rcu_head *rcu) -{ - struct bpf_htab *htab; - - htab = container_of(rcu, struct bpf_htab, rcu); - bpf_map_area_free(htab->buckets); - kfree(htab); -} - -static void sock_hash_free(struct bpf_map *map) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; - - synchronize_rcu(); - - /* At this point no update, lookup or delete operations can happen. - * However, be aware we can still get a socket state event updates, - * and data ready callabacks that reference the psock from sk_user_data - * Also psock worker threads are still in-flight. So smap_release_sock - * will only free the psock after cancel_sync on the worker threads - * and a grace period expire to ensure psock is really safe to remove. - */ - rcu_read_lock(); - for (i = 0; i < htab->n_buckets; i++) { - struct bucket *b = __select_bucket(htab, i); - struct hlist_head *head; - struct hlist_node *n; - struct htab_elem *l; - - raw_spin_lock_bh(&b->lock); - head = &b->head; - hlist_for_each_entry_safe(l, n, head, hash_node) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get - * the sk_callback_lock before this case but after xchg - * causing the refcnt to hit zero and sock user data - * (psock) to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - } - raw_spin_unlock_bh(&b->lock); - } - rcu_read_unlock(); - call_rcu(&htab->rcu, __bpf_htab_free); -} - -static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, - void *key, u32 key_size, u32 hash, - struct sock *sk, - struct htab_elem *old_elem) -{ - struct htab_elem *l_new; - - if (atomic_inc_return(&htab->count) > htab->map.max_entries) { - if (!old_elem) { - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); - } - } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); - if (!l_new) { - atomic_dec(&htab->count); - return ERR_PTR(-ENOMEM); - } - - memcpy(l_new->key, key, key_size); - l_new->sk = sk; - l_new->hash = hash; - return l_new; -} - -static inline u32 htab_map_hash(const void *key, u32 key_len) -{ - return jhash(key, key_len, 0); -} - -static int sock_hash_get_next_key(struct bpf_map *map, - void *key, void *next_key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l, *next_l; - struct hlist_head *h; - u32 hash, key_size; - int i = 0; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - key_size = map->key_size; - if (!key) - goto find_first_elem; - hash = htab_map_hash(key, key_size); - h = select_bucket(htab, hash); - - l = lookup_elem_raw(h, hash, key, key_size); - if (!l) - goto find_first_elem; - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), - struct htab_elem, hash_node); - if (next_l) { - memcpy(next_key, next_l->key, key_size); - return 0; - } - - /* no more elements in this hash list, go to the next bucket */ - i = hash & (htab->n_buckets - 1); - i++; - -find_first_elem: - /* iterate over buckets */ - for (; i < htab->n_buckets; i++) { - h = select_bucket(htab, i); - - /* pick first element in the bucket */ - next_l = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(h)), - struct htab_elem, hash_node); - if (next_l) { - /* if it's not empty, just return it */ - memcpy(next_key, next_l->key, key_size); - return 0; - } - } - - /* iterated over all buckets and all elements */ - return -ENOENT; -} - -static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 map_flags) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct bpf_sock_progs *progs = &htab->progs; - struct htab_elem *l_new = NULL, *l_old; - struct smap_psock_map_entry *e = NULL; - struct hlist_head *head; - struct smap_psock *psock; - u32 key_size, hash; - struct sock *sock; - struct bucket *b; - int err; - - sock = skops->sk; - - if (sock->sk_type != SOCK_STREAM || - sock->sk_protocol != IPPROTO_TCP) - return -EOPNOTSUPP; - - if (unlikely(map_flags > BPF_EXIST)) - return -EINVAL; - - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) - return -ENOMEM; - - WARN_ON_ONCE(!rcu_read_lock_held()); - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - err = __sock_map_ctx_update_elem(map, progs, sock, key); - if (err) - goto err; - - /* psock is valid here because otherwise above *ctx_update_elem would - * have thrown an error. It is safe to skip error check. - */ - psock = smap_psock_sk(sock); - raw_spin_lock_bh(&b->lock); - l_old = lookup_elem_raw(head, hash, key, key_size); - if (l_old && map_flags == BPF_NOEXIST) { - err = -EEXIST; - goto bucket_err; - } - if (!l_old && map_flags == BPF_EXIST) { - err = -ENOENT; - goto bucket_err; - } - - l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); - if (IS_ERR(l_new)) { - err = PTR_ERR(l_new); - goto bucket_err; - } - - rcu_assign_pointer(e->hash_link, l_new); - e->map = map; - spin_lock_bh(&psock->maps_lock); - list_add_tail(&e->list, &psock->maps); - spin_unlock_bh(&psock->maps_lock); - - /* add new element to the head of the list, so that - * concurrent search will find it before old elem - */ - hlist_add_head_rcu(&l_new->hash_node, head); - if (l_old) { - psock = smap_psock_sk(l_old->sk); - - hlist_del_rcu(&l_old->hash_node); - smap_list_hash_remove(psock, l_old); - smap_release_sock(psock, l_old->sk); - free_htab_elem(htab, l_old); - } - raw_spin_unlock_bh(&b->lock); - return 0; -bucket_err: - smap_release_sock(psock, sock); - raw_spin_unlock_bh(&b->lock); -err: - kfree(e); - return err; -} - -static int sock_hash_update_elem(struct bpf_map *map, - void *key, void *value, u64 flags) -{ - struct bpf_sock_ops_kern skops; - u32 fd = *(u32 *)value; - struct socket *socket; - int err; - - socket = sockfd_lookup(fd, &err); - if (!socket) - return err; - - skops.sk = socket->sk; - if (!skops.sk) { - fput(socket->file); - return -EINVAL; - } - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. - */ - if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP || - skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; - } - - lock_sock(skops.sk); - preempt_disable(); - rcu_read_lock(); - err = sock_hash_ctx_update_elem(&skops, map, key, flags); - rcu_read_unlock(); - preempt_enable(); - release_sock(skops.sk); - fput(socket->file); - return err; -} - -static int sock_hash_delete_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct bucket *b; - struct htab_elem *l; - u32 hash, key_size; - int ret = -ENOENT; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - raw_spin_lock_bh(&b->lock); - l = lookup_elem_raw(head, hash, key, key_size); - if (l) { - struct sock *sock = l->sk; - struct smap_psock *psock; - - hlist_del_rcu(&l->hash_node); - psock = smap_psock_sk(sock); - /* This check handles a racing sock event that can get the - * sk_callback_lock before this case but after xchg happens - * causing the refcnt to hit zero and sock user data (psock) - * to be null and queued for garbage collection. - */ - if (likely(psock)) { - smap_list_hash_remove(psock, l); - smap_release_sock(psock, sock); - } - free_htab_elem(htab, l); - ret = 0; - } - raw_spin_unlock_bh(&b->lock); - return ret; -} - -struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) -{ - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct hlist_head *head; - struct htab_elem *l; - u32 key_size, hash; - struct bucket *b; - struct sock *sk; - - key_size = map->key_size; - hash = htab_map_hash(key, key_size); - b = __select_bucket(htab, hash); - head = &b->head; - - l = lookup_elem_raw(head, hash, key, key_size); - sk = l ? l->sk : NULL; - return sk; -} - -const struct bpf_map_ops sock_map_ops = { - .map_alloc = sock_map_alloc, - .map_free = sock_map_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_map_get_next_key, - .map_update_elem = sock_map_update_elem, - .map_delete_elem = sock_map_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -const struct bpf_map_ops sock_hash_ops = { - .map_alloc = sock_hash_alloc, - .map_free = sock_hash_free, - .map_lookup_elem = sock_map_lookup, - .map_get_next_key = sock_hash_get_next_key, - .map_update_elem = sock_hash_update_elem, - .map_delete_elem = sock_hash_delete_elem, - .map_release_uref = sock_map_release, - .map_check_btf = map_check_no_btf, -}; - -static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) -{ - return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || - ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; -} -BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* ULPs are currently supported only for TCP sockets in ESTABLISHED - * state. This checks that the sock ops triggering the update is - * one indicating we are (or will be soon) in an ESTABLISHED state. - */ - if (!bpf_is_valid_sock_op(bpf_sock)) - return -EOPNOTSUPP; - return sock_map_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_map_update_proto = { - .func = bpf_sock_map_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; - -BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (!bpf_is_valid_sock_op(bpf_sock)) - return -EOPNOTSUPP; - return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); -} - -const struct bpf_func_proto bpf_sock_hash_update_proto = { - .func = bpf_sock_hash_update, - .gpl_only = false, - .pkt_access = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_PTR_TO_MAP_KEY, - .arg4_type = ARG_ANYTHING, -}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53968f82b919..f4ecd6ed2252 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1664,7 +1664,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: - ret = sockmap_get_from_fd(attr, ptype, prog); + ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); @@ -1718,10 +1718,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; case BPF_SK_MSG_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); + return sock_map_get_from_fd(attr, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: -- cgit v1.2.3 From e45506ac0af9b56b221863e9649fe122d8bb42ff Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Thu, 18 Oct 2018 10:21:33 -0400 Subject: softirq: Fix typo in __do_softirq() comments s/s/as [ mingo: Also add a missing 'the', add proper punctuation and clarify what 'swap' means here. ] Signed-off-by: Yangtao Li Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: alexander.levin@verizon.com Cc: frederic@kernel.org Cc: joel@joelfernandes.org Cc: paulmck@linux.vnet.ibm.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20181018142133.12341-1-tiny.windzz@gmail.com Signed-off-by: Ingo Molnar --- kernel/softirq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 6f584861d329..9526895fe4ac 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -257,9 +257,9 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) int softirq_bit; /* - * Mask out PF_MEMALLOC s current task context is borrowed for the - * softirq. A softirq handled such as network RX might set PF_MEMALLOC - * again if the socket is related to swap + * Mask out PF_MEMALLOC as the current task context is borrowed for the + * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC + * again if the socket is related to swapping. */ current->flags &= ~PF_MEMALLOC; -- cgit v1.2.3 From 21bb9d64c5adc0a87c3736bc9a2d386023b88a5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Apr 2018 10:46:01 +0200 Subject: swiotlb: remove a pointless comment This comments describes an aspect of the map_sg interface that isn't even exploited by swiotlb. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 4f8a6dbf0b60..9062b14bc7f4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -925,12 +925,6 @@ swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ -- cgit v1.2.3 From b65125c6acf38388d3342b37c18c3b6cc97eeb75 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Apr 2018 14:49:23 +0200 Subject: swiotlb: mark is_swiotlb_buffer static Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9062b14bc7f4..26d3af52956f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -429,7 +429,7 @@ void __init swiotlb_exit(void) max_segment = 0; } -int is_swiotlb_buffer(phys_addr_t paddr) +static int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= io_tlb_start && paddr < io_tlb_end; } -- cgit v1.2.3 From 8088546832aa2c0d8f99dd56edf6384f8a9b63b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Apr 2018 10:38:08 +0200 Subject: swiotlb: do not panic on mapping failures All properly written drivers now have error handling in the dma_map_single / dma_map_page callers. As swiotlb_tbl_map_single already prints a useful warning when running out of swiotlb pool space we can also remove swiotlb_full entirely as it serves no purpose now. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/swiotlb.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 26d3af52956f..69bf305ee5f8 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -761,34 +761,6 @@ static bool swiotlb_free_buffer(struct device *dev, size_t size, return true; } -static void -swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, - int do_panic) -{ - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * unless they check for dma_mapping_error (most don't) - * When the mapping is small enough return a static buffer to limit - * the damage, or panic when the transfer is too big. - */ - dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", - size); - - if (size <= io_tlb_overflow || !do_panic) - return; - - if (dir == DMA_BIDIRECTIONAL) - panic("DMA: Random memory could be DMA accessed\n"); - if (dir == DMA_FROM_DEVICE) - panic("DMA: Random memory could be DMA written\n"); - if (dir == DMA_TO_DEVICE) - panic("DMA: Random memory could be DMA read\n"); -} - /* * Map a single buffer of the indicated size for DMA in streaming mode. The * physical address to use is returned. @@ -817,10 +789,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, /* Oh well, have to allocate and map a bounce buffer. */ map = map_single(dev, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - swiotlb_full(dev, size, dir, 1); + if (map == SWIOTLB_MAP_ERROR) return __phys_to_dma(dev, io_tlb_overflow_buffer); - } dev_addr = __phys_to_dma(dev, map); @@ -948,7 +918,6 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, if (map == SWIOTLB_MAP_ERROR) { /* Don't panic here, we expect map_sg users to do proper error handling. */ - swiotlb_full(hwdev, sg->length, dir, 0); attrs |= DMA_ATTR_SKIP_CPU_SYNC; swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); -- cgit v1.2.3 From dff8d6c1ed584de65aac40494d3e7468c50980c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Aug 2018 15:30:39 +0300 Subject: swiotlb: remove the overflow buffer Like all other dma mapping drivers just return an error code instead of an actual memory buffer. The reason for the overflow buffer was that at the time swiotlb was invented there was no way to check for dma mapping errors, but this has long been fixed. Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas Reviewed-by: Robin Murphy Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 2 -- kernel/dma/swiotlb.c | 59 +++------------------------------------------------- 2 files changed, 3 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 87a6bc2a96c0..f14c376937e5 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -14,8 +14,6 @@ #include #include -#define DIRECT_MAPPING_ERROR 0 - /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but * some use it for entirely different regions: diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 69bf305ee5f8..11dbcd80b4a6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -72,13 +72,6 @@ static phys_addr_t io_tlb_start, io_tlb_end; */ static unsigned long io_tlb_nslabs; -/* - * When the IOMMU overflows we return a fallback buffer. This sets the size. - */ -static unsigned long io_tlb_overflow = 32*1024; - -static phys_addr_t io_tlb_overflow_buffer; - /* * This is a free list describing the number of free entries available from * each index @@ -126,7 +119,6 @@ setup_io_tlb_npages(char *str) return 0; } early_param("swiotlb", setup_io_tlb_npages); -/* make io_tlb_overflow tunable too? */ unsigned long swiotlb_nr_tbl(void) { @@ -194,16 +186,10 @@ void __init swiotlb_update_mem_attributes(void) bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); memset(vaddr, 0, bytes); - - vaddr = phys_to_virt(io_tlb_overflow_buffer); - bytes = PAGE_ALIGN(io_tlb_overflow); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - void *v_overflow_buffer; unsigned long i, bytes; bytes = nslabs << IO_TLB_SHIFT; @@ -212,17 +198,6 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_start = __pa(tlb); io_tlb_end = io_tlb_start + bytes; - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = memblock_virt_alloc_low_nopanic( - PAGE_ALIGN(io_tlb_overflow), - PAGE_SIZE); - if (!v_overflow_buffer) - return -ENOMEM; - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE @@ -330,7 +305,6 @@ int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; - unsigned char *v_overflow_buffer; bytes = nslabs << IO_TLB_SHIFT; @@ -341,19 +315,6 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); memset(tlb, 0, bytes); - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - set_memory_decrypted((unsigned long)v_overflow_buffer, - io_tlb_overflow >> PAGE_SHIFT); - memset(v_overflow_buffer, 0, io_tlb_overflow); - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); - /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE @@ -390,10 +351,6 @@ cleanup4: sizeof(int))); io_tlb_list = NULL; cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; -cleanup2: io_tlb_end = 0; io_tlb_start = 0; io_tlb_nslabs = 0; @@ -407,8 +364,6 @@ void __init swiotlb_exit(void) return; if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), - get_order(io_tlb_overflow)); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -416,8 +371,6 @@ void __init swiotlb_exit(void) free_pages((unsigned long)phys_to_virt(io_tlb_start), get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - memblock_free_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); memblock_free_late(__pa(io_tlb_list), @@ -790,7 +743,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, /* Oh well, have to allocate and map a bounce buffer. */ map = map_single(dev, phys, size, dir, attrs); if (map == SWIOTLB_MAP_ERROR) - return __phys_to_dma(dev, io_tlb_overflow_buffer); + return DIRECT_MAPPING_ERROR; dev_addr = __phys_to_dma(dev, map); @@ -801,7 +754,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, attrs |= DMA_ATTR_SKIP_CPU_SYNC; swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - return __phys_to_dma(dev, io_tlb_overflow_buffer); + return DIRECT_MAPPING_ERROR; } /* @@ -985,12 +938,6 @@ swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); } -int -swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) -{ - return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); -} - /* * Return whether the given device DMA address mask can be supported * properly. For example, if your device can only drive the low 24-bits @@ -1033,7 +980,7 @@ void swiotlb_free(struct device *dev, size_t size, void *vaddr, } const struct dma_map_ops swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, + .mapping_error = dma_direct_mapping_error, .alloc = swiotlb_alloc, .free = swiotlb_free, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, -- cgit v1.2.3 From 27744e0077f4c8b40aaa3126256708c21e56655d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Apr 2018 09:56:56 +0200 Subject: swiotlb: merge swiotlb_unmap_page and unmap_single Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 11dbcd80b4a6..15335f3a1bf3 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -765,9 +765,9 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); @@ -790,13 +790,6 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, dma_mark_clean(phys_to_virt(paddr), size); } -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unmap_single(hwdev, dev_addr, size, dir, attrs); -} - /* * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. @@ -900,7 +893,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + swiotlb_unmap_page(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); } -- cgit v1.2.3 From 4803b44e68fc08e76f00dec90074d199a11ad6f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Aug 2018 15:56:05 +0200 Subject: swiotlb: use swiotlb_map_page in swiotlb_map_sg_attrs No need to duplicate the code - map_sg is equivalent to map_page for each page in the scatterlist. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 15335f3a1bf3..15755d7a5242 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -845,37 +845,27 @@ swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, * same here. */ int -swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, +swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - - if (swiotlb_force == SWIOTLB_FORCE || - !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - /* Don't panic here, we expect map_sg users - to do proper error handling. */ - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sg_dma_len(sgl) = 0; - return 0; - } - sg->dma_address = __phys_to_dma(hwdev, map); - } else - sg->dma_address = dev_addr; + sg->dma_address = swiotlb_map_page(dev, sg_page(sg), sg->offset, + sg->length, dir, attrs); + if (sg->dma_address == DIRECT_MAPPING_ERROR) + goto out_error; sg_dma_len(sg) = sg->length; } + return nelems; + +out_error: + swiotlb_unmap_sg_attrs(dev, sgl, i, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); + sg_dma_len(sgl) = 0; + return 0; } /* -- cgit v1.2.3 From c4dae366925f929749b2a26efa53b561904a9a4f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Aug 2018 16:21:10 +0200 Subject: swiotlb: refactor swiotlb_map_page Remove the somewhat useless map_single function, and replace it with a swiotlb_bounce_page handler that handles everything related to actually bouncing a page. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 67 +++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 15755d7a5242..57507b18caa4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -543,26 +543,6 @@ found: return tlb_addr; } -/* - * Allocates bounce buffer and returns its physical address. - */ -static phys_addr_t -map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t start_dma_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) { - dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", - &phys); - return SWIOTLB_MAP_ERROR; - } - - start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, - dir, attrs); -} - /* * tlb_addr is the physical address of the bounce buffer to unmap. */ @@ -714,6 +694,34 @@ static bool swiotlb_free_buffer(struct device *dev, size_t size, return true; } +static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t dma_addr; + + if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { + dev_warn_ratelimited(dev, + "Cannot do DMA to address %pa\n", phys); + return DIRECT_MAPPING_ERROR; + } + + /* Oh well, have to allocate and map a bounce buffer. */ + *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), + *phys, size, dir, attrs); + if (*phys == SWIOTLB_MAP_ERROR) + return DIRECT_MAPPING_ERROR; + + /* Ensure that the address returned is DMA'ble */ + dma_addr = __phys_to_dma(dev, *phys); + if (unlikely(!dma_capable(dev, dma_addr, size))) { + swiotlb_tbl_unmap_single(dev, *phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); + return DIRECT_MAPPING_ERROR; + } + + return dma_addr; +} + /* * Map a single buffer of the indicated size for DMA in streaming mode. The * physical address to use is returned. @@ -726,7 +734,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t map, phys = page_to_phys(page) + offset; + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); BUG_ON(dir == DMA_NONE); @@ -739,22 +747,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, return dev_addr; trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - - /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) - return DIRECT_MAPPING_ERROR; - - dev_addr = __phys_to_dma(dev, map); - - /* Ensure that the address returned is DMA'ble */ - if (dma_capable(dev, dev_addr, size)) - return dev_addr; - - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - - return DIRECT_MAPPING_ERROR; + return swiotlb_bounce_page(dev, &phys, size, dir, attrs); } /* -- cgit v1.2.3 From fafadcd16595c1df82df399f62421718ec9bf70a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Sep 2018 16:13:33 -0700 Subject: swiotlb: don't dip into swiotlb pool for coherent allocations All architectures that support swiotlb also have a zone that backs up these less than full addressing allocations (usually ZONE_DMA32). Because of that it is rather pointless to fall back to the global swiotlb buffer if the normal dma direct allocation failed - the only thing this will do is to eat up bounce buffers that would be more useful to serve streaming mappings. Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas Acked-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 105 +-------------------------------------------------- 1 file changed, 2 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 57507b18caa4..1a01b0ac0a5e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -622,78 +622,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } } -static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, - size_t size) -{ - u64 mask = DMA_BIT_MASK(32); - - if (dev && dev->coherent_dma_mask) - mask = dev->coherent_dma_mask; - return addr + size - 1 <= mask; -} - -static void * -swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned long attrs) -{ - phys_addr_t phys_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - goto out_warn; - - phys_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), - 0, size, DMA_FROM_DEVICE, attrs); - if (phys_addr == SWIOTLB_MAP_ERROR) - goto out_warn; - - *dma_handle = __phys_to_dma(dev, phys_addr); - if (!dma_coherent_ok(dev, *dma_handle, size)) - goto out_unmap; - - memset(phys_to_virt(phys_addr), 0, size); - return phys_to_virt(phys_addr); - -out_unmap: - dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dev->coherent_dma_mask, - (unsigned long long)*dma_handle); - - /* - * DMA_TO_DEVICE to avoid memcpy in unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); -out_warn: - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { - dev_warn(dev, - "swiotlb: coherent allocation failed, size=%zu\n", - size); - dump_stack(); - } - return NULL; -} - -static bool swiotlb_free_buffer(struct device *dev, size_t size, - dma_addr_t dma_addr) -{ - phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); - - WARN_ON_ONCE(irqs_disabled()); - - if (!is_swiotlb_buffer(phys_addr)) - return false; - - /* - * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - return true; -} - static dma_addr_t swiotlb_bounce_page(struct device *dev, phys_addr_t *phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { @@ -926,39 +854,10 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask) return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } -void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - void *vaddr; - - /* temporary workaround: */ - if (gfp & __GFP_NOWARN) - attrs |= DMA_ATTR_NO_WARN; - - /* - * Don't print a warning when the first allocation attempt fails. - * swiotlb_alloc_coherent() will print a warning when the DMA memory - * allocation ultimately failed. - */ - gfp |= __GFP_NOWARN; - - vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); - return vaddr; -} - -void swiotlb_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - if (!swiotlb_free_buffer(dev, size, dma_addr)) - dma_direct_free(dev, size, vaddr, dma_addr, attrs); -} - const struct dma_map_ops swiotlb_dma_ops = { .mapping_error = dma_direct_mapping_error, - .alloc = swiotlb_alloc, - .free = swiotlb_free, + .alloc = dma_direct_alloc, + .free = dma_direct_free, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, -- cgit v1.2.3 From a4a4330db46a17289cf2ca5f9fb153d536267b97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Oct 2018 08:51:53 +0200 Subject: swiotlb: add support for non-coherent DMA Handle architectures that are not cache coherent directly in the main swiotlb code by calling arch_sync_dma_for_{device,cpu} in all the right places from the various dma_map/unmap/sync methods when the device is non-coherent. Because swiotlb now uses dma_direct_alloc for the coherent allocation that side is already taken care of by the dma-direct code calling into arch_dma_{alloc,free} for devices that are non-coherent. Signed-off-by: Christoph Hellwig Acked-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 1a01b0ac0a5e..ebecaf255ea2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -671,11 +672,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) - return dev_addr; + if (!dma_capable(dev, dev_addr, size) || + swiotlb_force == SWIOTLB_FORCE) { + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + dev_addr = swiotlb_bounce_page(dev, &phys, size, dir, attrs); + } + + if (!dev_is_dma_coherent(dev) && + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + arch_sync_dma_for_device(dev, phys, size, dir); - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - return swiotlb_bounce_page(dev, &phys, size, dir, attrs); + return dev_addr; } /* @@ -694,6 +701,10 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); + if (!dev_is_dma_coherent(hwdev) && + (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) + arch_sync_dma_for_cpu(hwdev, paddr, size, dir); + if (is_swiotlb_buffer(paddr)) { swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); return; @@ -730,15 +741,17 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(paddr)) { + if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_CPU) + arch_sync_dma_for_cpu(hwdev, paddr, size, dir); + + if (is_swiotlb_buffer(paddr)) swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } - if (dir != DMA_FROM_DEVICE) - return; + if (!dev_is_dma_coherent(hwdev) && target == SYNC_FOR_DEVICE) + arch_sync_dma_for_device(hwdev, paddr, size, dir); - dma_mark_clean(phys_to_virt(paddr), size); + if (!is_swiotlb_buffer(paddr) && dir == DMA_FROM_DEVICE) + dma_mark_clean(phys_to_virt(paddr), size); } void -- cgit v1.2.3 From 746a923b863a1065ef77324e1e43f19b1a3eab5c Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 18 Oct 2018 15:15:05 +0200 Subject: genirq: Fix race on spurious interrupt detection Commit 1e77d0a1ed74 ("genirq: Sanitize spurious interrupt detection of threaded irqs") made detection of spurious interrupts work for threaded handlers by: a) incrementing a counter every time the thread returns IRQ_HANDLED, and b) checking whether that counter has increased every time the thread is woken. However for oneshot interrupts, the commit unmasks the interrupt before incrementing the counter. If another interrupt occurs right after unmasking but before the counter is incremented, that interrupt is incorrectly considered spurious: time | irq_thread() | irq_thread_fn() | action->thread_fn() | irq_finalize_oneshot() | unmask_threaded_irq() /* interrupt is unmasked */ | | /* interrupt fires, incorrectly deemed spurious */ | | atomic_inc(&desc->threads_handled); /* counter is incremented */ v This is observed with a hi3110 CAN controller receiving data at high volume (from a separate machine sending with "cangen -g 0 -i -x"): The controller signals a huge number of interrupts (hundreds of millions per day) and every second there are about a dozen which are deemed spurious. In theory with high CPU load and the presence of higher priority tasks, the number of incorrectly detected spurious interrupts might increase beyond the 99,900 threshold and cause disablement of the interrupt. In practice it just increments the spurious interrupt count. But that can cause people to waste time investigating it over and over. Fix it by moving the accounting before the invocation of irq_finalize_oneshot(). [ tglx: Folded change log update ] Fixes: 1e77d0a1ed74 ("genirq: Sanitize spurious interrupt detection of threaded irqs") Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Cc: Mathias Duckeck Cc: Akshay Bhat Cc: Casey Fitzpatrick Cc: stable@vger.kernel.org # v3.16+ Link: https://lkml.kernel.org/r/1dfd8bbd16163940648045495e3e9698e63b50ad.1539867047.git.lukas@wunner.de --- kernel/irq/manage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fb86146037a7..9dbdccab3b6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -927,6 +927,9 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) local_bh_disable(); ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); local_bh_enable(); return ret; @@ -943,6 +946,9 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, irqreturn_t ret; ret = action->thread_fn(action->irq, action->dev_id); + if (ret == IRQ_HANDLED) + atomic_inc(&desc->threads_handled); + irq_finalize_oneshot(desc, action); return ret; } @@ -1020,8 +1026,6 @@ static int irq_thread(void *data) irq_thread_check_affinity(desc, action); action_ret = handler_fn(desc, action); - if (action_ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); -- cgit v1.2.3 From 144991602e6a14d667b295f1b099e609ce857772 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:09 +0200 Subject: bpf: rename stack trace map operations In the following patches queue and stack maps (FIFO and LIFO datastructures) will be implemented. In order to avoid confusion and a possible name clash rename stack_map_ops to stack_trace_map_ops Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/stackmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index b2ade10f7ec3..90daf285de03 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -600,7 +600,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_trace_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, -- cgit v1.2.3 From c9d29f4658a5a6d2c2ba2afeb20ff763fc6286f9 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:14 +0200 Subject: bpf/syscall: allow key to be null in map functions This commit adds the required logic to allow key being NULL in case the key_size of the map is 0. A new __bpf_copy_key function helper only copies the key from userpsace when key_size != 0, otherwise it enforces that key must be null. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f4ecd6ed2252..78d9dd95e25f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -651,6 +651,17 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return -ENOTSUPP; } +static void *__bpf_copy_key(void __user *ukey, u64 key_size) +{ + if (key_size) + return memdup_user(ukey, key_size); + + if (ukey) + return ERR_PTR(-EINVAL); + + return NULL; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -678,7 +689,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -785,7 +796,7 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -888,7 +899,7 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; @@ -941,7 +952,7 @@ static int map_get_next_key(union bpf_attr *attr) } if (ukey) { - key = memdup_user(ukey, map->key_size); + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; -- cgit v1.2.3 From 2ea864c58f19bf70a0e2415f9f1c53814e07f1b4 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:20 +0200 Subject: bpf/verifier: add ARG_PTR_TO_UNINIT_MAP_VALUE ARG_PTR_TO_UNINIT_MAP_VALUE argument is a pointer to a memory zone used to save the value of a map. Basically the same as ARG_PTR_TO_UNINIT_MEM, but the size has not be passed as an extra argument. This will be used in the following patch that implements some new helpers that receive a pointer to be filled with a map value. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3f93a548a642..d84c91ac3b70 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2117,7 +2117,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (arg_type == ARG_PTR_TO_MAP_KEY || - arg_type == ARG_PTR_TO_MAP_VALUE) { + arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { expected_type = PTR_TO_STACK; if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != expected_type) @@ -2187,7 +2188,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ @@ -2196,9 +2198,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } + meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, - NULL); + meta); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); -- cgit v1.2.3 From f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:25 +0200 Subject: bpf: add queue and stack maps Queue/stack maps implement a FIFO/LIFO data storage for ebpf programs. These maps support peek, pop and push operations that are exposed to eBPF programs through the new bpf_map[peek/pop/push] helpers. Those operations are exposed to userspace applications through the already existing syscalls in the following way: BPF_MAP_LOOKUP_ELEM -> peek BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop BPF_MAP_UPDATE_ELEM -> push Queue/stack maps are implemented using a buffer, tail and head indexes, hence BPF_F_NO_PREALLOC is not supported. As opposite to other maps, queue and stack do not use RCU for protecting maps values, the bpf_map[peek/pop] have a ARG_PTR_TO_UNINIT_MAP_VALUE argument that is a pointer to a memory zone where to save the value of a map. Basically the same as ARG_PTR_TO_UNINIT_MEM, but the size has not be passed as an extra argument. Our main motivation for implementing queue/stack maps was to keep track of a pool of elements, like network ports in a SNAT, however we forsee other use cases, like for exampling saving last N kernel events in a map and then analysing from userspace. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 3 + kernel/bpf/helpers.c | 43 +++++++ kernel/bpf/queue_stack_maps.c | 288 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + kernel/bpf/verifier.c | 19 ++- 6 files changed, 359 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/queue_stack_maps.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index ff8262626b8f..4c2fa3ac56f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index defcf4df6d91..7c7eeea8cffc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32) const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_map_push_elem_proto __weak; +const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6502115e8f55..ab0d5e3f9892 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; +BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +{ + return map->ops->map_push_elem(map, value, flags); +} + +const struct bpf_func_proto bpf_map_push_elem_proto = { + .func = bpf_map_push_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_pop_elem(map, value); +} + +const struct bpf_func_proto bpf_map_pop_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + +BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_peek_elem(map, value); +} + +const struct bpf_func_proto bpf_map_peek_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c new file mode 100644 index 000000000000..12a93fb37449 --- /dev/null +++ b/kernel/bpf/queue_stack_maps.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * queue_stack_maps.c: BPF queue and stack maps + * + * Copyright (c) 2018 Politecnico di Torino + */ +#include +#include +#include +#include "percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { + struct bpf_map map; + raw_spinlock_t lock; + u32 head, tail; + u32 size; /* max_entries + 1 */ + + char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ + return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ + return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ + u32 head = qs->head + 1; + + if (unlikely(head >= qs->size)) + head = 0; + + return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 0 || + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ + int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_queue_stack *qs; + u32 size, value_size; + u64 queue_size, cost; + + size = attr->max_entries + 1; + value_size = attr->value_size; + + queue_size = sizeof(*qs) + (u64) value_size * size; + + cost = queue_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); + + qs = bpf_map_area_alloc(queue_size, numa_node); + if (!qs) + return ERR_PTR(-ENOMEM); + + memset(qs, 0, sizeof(*qs)); + + bpf_map_init_from_attr(&qs->map, attr); + + qs->map.pages = cost; + qs->size = size; + + raw_spin_lock_init(&qs->lock); + + return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + bpf_map_area_free(qs); +} + +static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + ptr = &qs->elements[qs->tail * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) { + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + + +static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + u32 index; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + index = qs->head - 1; + if (unlikely(index >= qs->size)) + index = qs->size - 1; + + ptr = &qs->elements[index * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) + qs->head = index; + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static int queue_map_peek_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_peek_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int queue_map_pop_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_pop_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long irq_flags; + int err = 0; + void *dst; + + /* BPF_EXIST is used to force making room for a new element in case the + * map is full + */ + bool replace = (flags & BPF_EXIST); + + /* Check supported flags for queue and stack maps */ + if (flags & BPF_NOEXIST || flags > BPF_EXIST) + return -EINVAL; + + raw_spin_lock_irqsave(&qs->lock, irq_flags); + + if (queue_stack_map_is_full(qs)) { + if (!replace) { + err = -E2BIG; + goto out; + } + /* advance tail pointer to overwrite oldest element */ + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + + dst = &qs->elements[qs->head * qs->map.value_size]; + memcpy(dst, value, qs->map.value_size); + + if (unlikely(++qs->head >= qs->size)) + qs->head = 0; + +out: + raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called from syscall */ +static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EINVAL; +} + +const struct bpf_map_ops queue_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = queue_map_pop_elem, + .map_peek_elem = queue_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; + +const struct bpf_map_ops stack_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = stack_map_pop_elem, + .map_peek_elem = stack_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 78d9dd95e25f..1617407f9ee5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -727,6 +727,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -857,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr) /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d84c91ac3b70..7d6d9cf9ebd5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2324,6 +2324,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + if (func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_push_elem) + goto error; + break; default: break; } @@ -2380,6 +2387,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) goto error; break; + case BPF_FUNC_map_peek_elem: + case BPF_FUNC_map_pop_elem: + case BPF_FUNC_map_push_elem: + if (map->map_type != BPF_MAP_TYPE_QUEUE && + map->map_type != BPF_MAP_TYPE_STACK) + goto error; + break; default: break; } @@ -2675,7 +2689,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && func_id != BPF_FUNC_map_update_elem && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_map_push_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_peek_elem) return 0; if (meta->map_ptr == NULL) { -- cgit v1.2.3 From bd513cd08f10cbe28856f99ae951e86e86803861 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:30 +0200 Subject: bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall The previous patch implemented a bpf queue/stack maps that provided the peek/pop/push functions. There is not a direct relationship between those functions and the current maps syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall is added, this is mapped to the pop operation in the queue/stack maps and it is still to implement in other kind of maps. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1617407f9ee5..49ae64a26562 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -999,6 +999,69 @@ err_put: return err; } +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value + +static int map_lookup_and_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); + int ufd = attr->map_fd; + struct bpf_map *map; + void *key, *value, *ptr; + u32 value_size; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + key = __bpf_copy_key(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto err_put; + } + + value_size = map->value_size; + + err = -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + goto free_key; + + if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_pop_elem(map, value); + } else { + err = -ENOTSUPP; + } + + if (err) + goto free_value; + + if (copy_to_user(uvalue, value, value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2472,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr); break; + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: + err = map_lookup_and_delete_elem(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From b39b5f411dcfce28ff954e5d6acb2c11be3cb0ec Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 19 Oct 2018 09:57:57 -0700 Subject: bpf: add cg_skb_is_valid_access for BPF_PROG_TYPE_CGROUP_SKB BPF programs of BPF_PROG_TYPE_CGROUP_SKB need to access headers in the skb. This patch enables direct access of skb for these programs. Two helper functions bpf_compute_and_save_data_end() and bpf_restore_data_end() are introduced. There are used in __cgroup_bpf_run_filter_skb(), to compute proper data_end for the BPF program, and restore original data afterwards. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 00f6ed2e4f9a..9425c2fb872f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -553,6 +553,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, { unsigned int offset = skb->data - skb_network_header(skb); struct sock *save_sk; + void *saved_data_end; struct cgroup *cgrp; int ret; @@ -566,8 +567,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, save_sk = skb->sk; skb->sk = sk; __skb_push(skb, offset); + + /* compute pointers for the bpf prog */ + bpf_compute_and_save_data_end(skb, &saved_data_end); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, bpf_prog_run_save_cb); + bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; return ret == 1 ? 0 : -EPERM; -- cgit v1.2.3 From 540fefc08f75aedb517acbf525d393b8efddabd9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Oct 2018 13:52:38 -0700 Subject: bpf: remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix the following warning ../kernel/bpf/syscall.c: In function ‘map_lookup_and_delete_elem’: ../kernel/bpf/syscall.c:1010:22: warning: unused variable ‘ptr’ [-Wunused-variable] void *key, *value, *ptr; ^~~ Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 49ae64a26562..ccb93277aae2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1007,7 +1007,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; - void *key, *value, *ptr; + void *key, *value; u32 value_size; struct fd f; int err; -- cgit v1.2.3 From 2a159c6f82381a458bc56e7e202b6bee57a2ccb7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:24 +0200 Subject: bpf, verifier: fix register type dump in xadd and st Using reg_type_str[insn->dst_reg] is incorrect since insn->dst_reg contains the register number but not the actual register type. Add a small reg_state() helper and use it to get to the type. Also fix up the test_verifier test cases that have an incorrect errstr. Fixes: 9d2be44a7f33 ("bpf: Reuse canonical string formatter for ctx errs") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d6d9cf9ebd5..64e0981a4074 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1528,14 +1528,19 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); + return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); } static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = cur_regs(env) + regno; + const struct bpf_reg_state *reg = reg_state(env, regno); return reg->type == PTR_TO_CTX || reg->type == PTR_TO_SOCKET; @@ -1543,7 +1548,7 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = cur_regs(env) + regno; + const struct bpf_reg_state *reg = reg_state(env, regno); return type_is_pkt_pointer(reg->type); } @@ -1958,7 +1963,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", - insn->dst_reg, reg_type_str[insn->dst_reg]); + insn->dst_reg, + reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } @@ -1983,7 +1989,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; @@ -5264,7 +5270,8 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", - insn->dst_reg, reg_type_str[insn->dst_reg]); + insn->dst_reg, + reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } -- cgit v1.2.3 From 4b5defdec398491c5b301a6255cdf468eedfb228 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:25 +0200 Subject: bpf, verifier: reject xadd on flow key memory We should not enable xadd operation for flow key memory if not needed there anyway. There is no such issue as described in the commit f37a8cb84cce ("bpf: reject stores into ctx via st and xadd") since there's no context rewriter for flow keys today, but it also shouldn't become part of the user facing behavior to allow for it. After patch: 0: (79) r7 = *(u64 *)(r1 +144) 1: (b7) r3 = 4096 2: (db) lock *(u64 *)(r7 +0) += r3 BPF_XADD stores into R7 flow_keys is not allowed Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 64e0981a4074..0450ffcc3de4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1553,6 +1553,14 @@ static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) return type_is_pkt_pointer(reg->type); } +static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */ + return reg->type == PTR_TO_FLOW_KEYS; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1961,7 +1969,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg)) { + is_pkt_reg(env, insn->dst_reg) || + is_flow_key_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); -- cgit v1.2.3 From ad38911dcdb6978a415db65b3e00e0f3fcd9edfc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:26 +0200 Subject: bpf, verifier: remove unneeded flow key in check_helper_mem_access They PTR_TO_FLOW_KEYS is not used today to be passed into a helper as memory, so it can be removed from check_helper_mem_access(). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0450ffcc3de4..4f727c9eb45c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2077,8 +2077,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); - case PTR_TO_FLOW_KEYS: - return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); -- cgit v1.2.3 From 84430d4232c36cb858564374f839d233fd6f623a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 21 Oct 2018 02:09:27 +0200 Subject: bpf, verifier: avoid retpoline for map push/pop/peek operation Extend prior work from 09772d92cd5a ("bpf: avoid retpoline for lookup/update/delete calls on maps") to also apply to the recently added map helpers that perform push/pop/peek operations so that the indirect call can be avoided. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4f727c9eb45c..98fa0be35370 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6178,7 +6178,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (prog->jit_requested && BITS_PER_LONG == 64 && (insn->imm == BPF_FUNC_map_lookup_elem || insn->imm == BPF_FUNC_map_update_elem || - insn->imm == BPF_FUNC_map_delete_elem)) { + insn->imm == BPF_FUNC_map_delete_elem || + insn->imm == BPF_FUNC_map_push_elem || + insn->imm == BPF_FUNC_map_pop_elem || + insn->imm == BPF_FUNC_map_peek_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -6211,6 +6214,14 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_update_elem, (int (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_push_elem, + (int (*)(struct bpf_map *map, void *value, + u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_pop_elem, + (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_peek_elem, + (int (*)(struct bpf_map *map, void *value))NULL)); + switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - @@ -6224,6 +6235,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - __bpf_call_base; continue; + case BPF_FUNC_map_push_elem: + insn->imm = BPF_CAST_CALL(ops->map_push_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_pop_elem: + insn->imm = BPF_CAST_CALL(ops->map_pop_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_peek_elem: + insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - + __bpf_call_base; + continue; } goto patch_call_imm; -- cgit v1.2.3 From 876dcf2f3aaa0f68d437b368b93a4c4b81521191 Mon Sep 17 00:00:00 2001 From: Olivier Brunel Date: Sat, 20 Oct 2018 19:39:56 +0200 Subject: umh: Add command line to user mode helpers User mode helpers were spawned without a command line, and because an empty command line is used by many tools to identify processes as kernel threads, this could cause some issues. Notably during killing spree on shutdown, since such helper would then be skipped (i.e. not killed) which would result in the process remaining alive, and thus preventing unmouting of the rootfs (as experienced with the bpfilter umh). Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper") Signed-off-by: Olivier Brunel Signed-off-by: David S. Miller --- kernel/umh.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index c449858946af..0baa672e023c 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -405,11 +405,19 @@ struct subprocess_info *call_usermodehelper_setup_file(struct file *file, void (*cleanup)(struct subprocess_info *info), void *data) { struct subprocess_info *sub_info; + struct umh_info *info = data; + const char *cmdline = (info->cmdline) ? info->cmdline : "usermodehelper"; sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); if (!sub_info) return NULL; + sub_info->argv = argv_split(GFP_KERNEL, cmdline, NULL); + if (!sub_info->argv) { + kfree(sub_info); + return NULL; + } + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); sub_info->path = "none"; sub_info->file = file; @@ -458,10 +466,11 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) return 0; } -static void umh_save_pid(struct subprocess_info *info) +static void umh_clean_and_save_pid(struct subprocess_info *info) { struct umh_info *umh_info = info->data; + argv_free(info->argv); umh_info->pid = info->pid; } @@ -471,6 +480,9 @@ static void umh_save_pid(struct subprocess_info *info) * @len: length of the blob * @info: information about usermode process (shouldn't be NULL) * + * If info->cmdline is set it will be used as command line for the + * user process, else "usermodehelper" is used. + * * Returns either negative error or zero which indicates success * in executing a blob of bytes as a usermode process. In such * case 'struct umh_info *info' is populated with two pipes @@ -500,7 +512,7 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) err = -ENOMEM; sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, - umh_save_pid, info); + umh_clean_and_save_pid, info); if (!sub_info) goto out; -- cgit v1.2.3 From 9b6f7e163cd0f468d1b9696b785659d3c27c8667 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Oct 2018 15:03:19 -0700 Subject: mm: rework memcg kernel stack accounting If CONFIG_VMAP_STACK is set, kernel stacks are allocated using __vmalloc_node_range() with __GFP_ACCOUNT. So kernel stack pages are charged against corresponding memory cgroups on allocation and uncharged on releasing them. The problem is that we do cache kernel stacks in small per-cpu caches and do reuse them for new tasks, which can belong to different memory cgroups. Each stack page still holds a reference to the original cgroup, so the cgroup can't be released until the vmap area is released. To make this happen we need more than two subsequent exits without forks in between on the current cpu, which makes it very unlikely to happen. As a result, I saw a significant number of dying cgroups (in theory, up to 2 * number_of_cpu + number_of_tasks), which can't be released even by significant memory pressure. As a cgroup structure can take a significant amount of memory (first of all, per-cpu data like memcg statistics), it leads to a noticeable waste of memory. Link: http://lkml.kernel.org/r/20180827162621.30187-1-guro@fb.com Fixes: ac496bf48d97 ("fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y") Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Andy Lutomirski Cc: Konstantin Khlebnikov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f0b58479534f..3c719fec46c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -223,9 +223,14 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) return s->addr; } + /* + * Allocated stacks are cached and later reused by new threads, + * so memcg accounting is performed manually on assigning/releasing + * stacks to tasks. Drop __GFP_ACCOUNT. + */ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, + THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -248,9 +253,19 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK - if (task_stack_vm_area(tsk)) { + struct vm_struct *vm = task_stack_vm_area(tsk); + + if (vm) { int i; + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + -(int)(PAGE_SIZE / 1024)); + + memcg_kmem_uncharge(vm->pages[i], 0); + } + for (i = 0; i < NR_CACHED_STACKS; i++) { if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) != NULL) @@ -351,10 +366,6 @@ static void account_kernel_stack(struct task_struct *tsk, int account) NR_KERNEL_STACK_KB, PAGE_SIZE / 1024 * account); } - - /* All stack pages belong to the same memcg. */ - mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -370,6 +381,35 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } +static int memcg_charge_kernel_stack(struct task_struct *tsk) +{ +#ifdef CONFIG_VMAP_STACK + struct vm_struct *vm = task_stack_vm_area(tsk); + int ret; + + if (vm) { + int i; + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + /* + * If memcg_kmem_charge() fails, page->mem_cgroup + * pointer is NULL, and both memcg_kmem_uncharge() + * and mod_memcg_page_state() in free_thread_stack() + * will ignore this page. So it's safe. + */ + ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0); + if (ret) + return ret; + + mod_memcg_page_state(vm->pages[i], + MEMCG_KERNEL_STACK_KB, + PAGE_SIZE / 1024); + } + } +#endif + return 0; +} + static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(tsk->state != TASK_DEAD)) @@ -807,6 +847,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + if (memcg_charge_kernel_stack(tsk)) + goto free_stack; + stack_vm_area = task_stack_vm_area(tsk); err = arch_dup_task_struct(tsk, orig); -- cgit v1.2.3 From b1d29ba82cf2bc784f4c963ddd6a2cf29e229b33 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:08 -0700 Subject: delayacct: track delays from thrashing cache pages Delay accounting already measures the time a task spends in direct reclaim and waiting for swapin, but in low memory situations tasks spend can spend a significant amount of their time waiting on thrashing page cache. This isn't tracked right now. To know the full impact of memory contention on an individual task, measure the delay when waiting for a recently evicted active cache page to read back into memory. Also update tools/accounting/getdelays.c: [hannes@computer accounting]$ sudo ./getdelays -d -p 1 print delayacct stats ON PID 1 CPU count real total virtual total delay total delay average 50318 745000000 847346785 400533713 0.008ms IO count delay total delay average 435 122601218 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 0 0 0ms THRASHING count delay total delay average 19 12621439 0ms Link: http://lkml.kernel.org/r/20180828172258.3185-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ca8ac2824f0b..2a12b988c717 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -135,9 +135,12 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; + d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; + d->thrashing_count += tsk->delays->thrashing_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -169,3 +172,15 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_count); } +void __delayacct_thrashing_start(void) +{ + current->delays->thrashing_start = ktime_get_ns(); +} + +void __delayacct_thrashing_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->thrashing_start, + ¤t->delays->thrashing_delay, + ¤t->delays->thrashing_count); +} -- cgit v1.2.3 From 8508cf3ffad4defa202b303e5b6379efc4cd9054 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:11 -0700 Subject: sched: loadavg: consolidate LOAD_INT, LOAD_FRAC, CALC_LOAD There are several definitions of those functions/macros in places that mess with fixed-point load averages. Provide an official version. [akpm@linux-foundation.org: fix missed conversion in block/blk-iolatency.c] Link: http://lkml.kernel.org/r/20180828172258.3185-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_main.c | 7 +------ kernel/sched/loadavg.c | 15 --------------- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..bb4fe4e1a601 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2556,16 +2556,11 @@ static int kdb_summary(int argc, const char **argv) } kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); - /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ - -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); -#undef LOAD_INT -#undef LOAD_FRAC + /* Display in kilobytes */ #define K(x) ((x) << (PAGE_SHIFT - 10)) kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index a171c1258109..54fbdfb2d86c 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,21 +91,6 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } -/* - * a1 = a0 * e + a * (1 - e) - */ -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - unsigned long newload; - - newload = load * exp + active * (FIXED_1 - exp); - if (active >= load) - newload += FIXED_1-1; - - return newload / FIXED_1; -} - #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. -- cgit v1.2.3 From 5c54f5b9edb1aa2eabbb1091c458f1b6776a1896 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:16 -0700 Subject: sched: loadavg: make calc_load_n() public It's going to be used in a later patch. Keep the churn separate. Link: http://lkml.kernel.org/r/20180828172258.3185-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/loadavg.c | 138 ++++++++++++++++++++++++------------------------- 1 file changed, 69 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 54fbdfb2d86c..28a516575c18 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -91,6 +91,75 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) return delta; } +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + #ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. @@ -210,75 +279,6 @@ static long calc_load_nohz_fold(void) return delta; } -/** - * fixed_power_int - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - */ -static unsigned long -fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * a1 = a0 * e + a * (1 - e) - * - * a2 = a1 * e + a * (1 - e) - * = (a0 * e + a * (1 - e)) * e + a * (1 - e) - * = a0 * e^2 + a * (1 - e) * (1 + e) - * - * a3 = a2 * e + a * (1 - e) - * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) - * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) - * - * ... - * - * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] - * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) - * = a0 * e^n + a * (1 - e^n) - * - * [1] application of the geometric series: - * - * n 1 - x^(n+1) - * S_n := \Sum x^i = ------------- - * i=0 1 - x - */ -static unsigned long -calc_load_n(unsigned long load, unsigned long exp, - unsigned long active, unsigned int n) -{ - return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -} - /* * NO_HZ can leave us missing all per-CPU ticks calling * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into -- cgit v1.2.3 From 1f351d7f7590857ea281579c26e6045b4c548ef4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:19 -0700 Subject: sched: sched.h: make rq locking and clock functions available in stats.h kernel/sched/sched.h includes "stats.h" half-way through the file. The next patch introduces users of sched.h's rq locking functions and update_rq_clock() in kernel/sched/stats.h. Move those definitions up in the file so they are available in stats.h. Link: http://lkml.kernel.org/r/20180828172258.3185-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/sched.h | 164 +++++++++++++++++++++++++-------------------------- 1 file changed, 82 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b8c007713b3b..65a75b317935 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -957,6 +957,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +extern void update_rq_clock(struct rq *rq); + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -1075,6 +1077,86 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) #endif } +struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(rq->lock); + +struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + __acquires(p->pi_lock) + __acquires(rq->lock); + +static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); +} + +static inline void +rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irqsave(&rq->lock, rf->flags); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock_irq(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_pin_lock(rq, rf); +} + +static inline void +rq_relock(struct rq *rq, struct rq_flags *rf) + __acquires(rq->lock) +{ + raw_spin_lock(&rq->lock); + rq_repin_lock(rq, rf); +} + +static inline void +rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irqrestore(&rq->lock, rf->flags); +} + +static inline void +rq_unlock_irq(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock_irq(&rq->lock); +} + +static inline void +rq_unlock(struct rq *rq, struct rq_flags *rf) + __releases(rq->lock) +{ + rq_unpin_lock(rq, rf); + raw_spin_unlock(&rq->lock); +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, @@ -1717,8 +1799,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } -extern void update_rq_clock(struct rq *rq); - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -1783,86 +1863,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) #endif #endif -struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(rq->lock); - -struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - __acquires(p->pi_lock) - __acquires(rq->lock); - -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) - __releases(rq->lock) - __releases(p->pi_lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -} - -static inline void -rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irqsave(&rq->lock, rf->flags); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock_irq(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock_irq(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_lock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, rf); -} - -static inline void -rq_relock(struct rq *rq, struct rq_flags *rf) - __acquires(rq->lock) -{ - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); -} - -static inline void -rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -} - -static inline void -rq_unlock_irq(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock_irq(&rq->lock); -} - -static inline void -rq_unlock(struct rq *rq, struct rq_flags *rf) - __releases(rq->lock) -{ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); -} - #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From 246b3b3342c9b0a2e24cda2178be87bc36e1c874 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:23 -0700 Subject: sched: introduce this_rq_lock_irq() do_sched_yield() disables IRQs, looks up this_rq() and locks it. The next patch is adding another site with the same pattern, so provide a convenience function for it. Link: http://lkml.kernel.org/r/20180828172258.3185-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Suren Baghdasaryan Tested-by: Daniel Drake Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 4 +--- kernel/sched/sched.h | 12 ++++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2e696b03e99d..f3efef387797 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,9 +4933,7 @@ static void do_sched_yield(void) struct rq_flags rf; struct rq *rq; - local_irq_disable(); - rq = this_rq(); - rq_lock(rq, &rf); + rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65a75b317935..1de189bb9209 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1157,6 +1157,18 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) raw_spin_unlock(&rq->lock); } +static inline struct rq * +this_rq_lock_irq(struct rq_flags *rf) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, rf); + return rq; +} + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, -- cgit v1.2.3 From eb414681d5a07d28d2ff90dc05f69ec6b232ebd2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:27 -0700 Subject: psi: pressure stall information for CPU, memory, and IO When systems are overcommitted and resources become contended, it's hard to tell exactly the impact this has on workload productivity, or how close the system is to lockups and OOM kills. In particular, when machines work multiple jobs concurrently, the impact of overcommit in terms of latency and throughput on the individual job can be enormous. In order to maximize hardware utilization without sacrificing individual job health or risk complete machine lockups, this patch implements a way to quantify resource pressure in the system. A kernel built with CONFIG_PSI=y creates files in /proc/pressure/ that expose the percentage of time the system is stalled on CPU, memory, or IO, respectively. Stall states are aggregate versions of the per-task delay accounting delays: cpu: some tasks are runnable but not executing on a CPU memory: tasks are reclaiming, or waiting for swapin or thrashing cache io: tasks are waiting for io completions These percentages of walltime can be thought of as pressure percentages, and they give a general sense of system health and productivity loss incurred by resource overcommit. They can also indicate when the system is approaching lockup scenarios and OOMs. To do this, psi keeps track of the task states associated with each CPU and samples the time they spend in stall states. Every 2 seconds, the samples are averaged across CPUs - weighted by the CPUs' non-idle time to eliminate artifacts from unused CPUs - and translated into percentages of walltime. A running average of those percentages is maintained over 10s, 1m, and 5m periods (similar to the loadaverage). [hannes@cmpxchg.org: doc fixlet, per Randy] Link: http://lkml.kernel.org/r/20180828205625.GA14030@cmpxchg.org [hannes@cmpxchg.org: code optimization] Link: http://lkml.kernel.org/r/20180907175015.GA8479@cmpxchg.org [hannes@cmpxchg.org: rename psi_clock() to psi_update_work(), per Peter] Link: http://lkml.kernel.org/r/20180907145404.GB11088@cmpxchg.org [hannes@cmpxchg.org: fix build] Link: http://lkml.kernel.org/r/20180913014222.GA2370@cmpxchg.org Link: http://lkml.kernel.org/r/20180828172258.3185-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Tejun Heo Cc: Vinayak Menon Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 + kernel/sched/Makefile | 1 + kernel/sched/core.c | 12 +- kernel/sched/psi.c | 657 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 + kernel/sched/stats.h | 86 +++++++ 6 files changed, 760 insertions(+), 2 deletions(-) create mode 100644 kernel/sched/psi.c (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3c719fec46c5..8f82a3bdcb8f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1822,6 +1822,10 @@ static __latent_entropy struct task_struct *copy_process( p->default_timer_slack_ns = current->timer_slack_ns; +#ifdef CONFIG_PSI + p->psi_flags = 0; +#endif + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..21fb5a5662b5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_PSI) += psi.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3efef387797..fd2fce8a001b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -722,8 +722,10 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & ENQUEUE_RESTORE)) + if (!(flags & ENQUEUE_RESTORE)) { sched_info_queued(rq, p); + psi_enqueue(p, flags & ENQUEUE_WAKEUP); + } p->sched_class->enqueue_task(rq, p, flags); } @@ -733,8 +735,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); - if (!(flags & DEQUEUE_SAVE)) + if (!(flags & DEQUEUE_SAVE)) { sched_info_dequeued(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } p->sched_class->dequeue_task(rq, p, flags); } @@ -2037,6 +2041,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } @@ -3051,6 +3056,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); + psi_task_tick(rq); rq_unlock(rq, &rf); @@ -6067,6 +6073,8 @@ void __init sched_init(void) init_schedstats(); + psi_init(); + scheduler_running = 1; } diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c new file mode 100644 index 000000000000..595414599b98 --- /dev/null +++ b/kernel/sched/psi.c @@ -0,0 +1,657 @@ +/* + * Pressure stall information for CPU, memory and IO + * + * Copyright (c) 2018 Facebook, Inc. + * Author: Johannes Weiner + * + * When CPU, memory and IO are contended, tasks experience delays that + * reduce throughput and introduce latencies into the workload. Memory + * and IO contention, in addition, can cause a full loss of forward + * progress in which the CPU goes idle. + * + * This code aggregates individual task delays into resource pressure + * metrics that indicate problems with both workload health and + * resource utilization. + * + * Model + * + * The time in which a task can execute on a CPU is our baseline for + * productivity. Pressure expresses the amount of time in which this + * potential cannot be realized due to resource contention. + * + * This concept of productivity has two components: the workload and + * the CPU. To measure the impact of pressure on both, we define two + * contention states for a resource: SOME and FULL. + * + * In the SOME state of a given resource, one or more tasks are + * delayed on that resource. This affects the workload's ability to + * perform work, but the CPU may still be executing other tasks. + * + * In the FULL state of a given resource, all non-idle tasks are + * delayed on that resource such that nobody is advancing and the CPU + * goes idle. This leaves both workload and CPU unproductive. + * + * (Naturally, the FULL state doesn't exist for the CPU resource.) + * + * SOME = nr_delayed_tasks != 0 + * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * + * The percentage of wallclock time spent in those compound stall + * states gives pressure numbers between 0 and 100 for each resource, + * where the SOME percentage indicates workload slowdowns and the FULL + * percentage indicates reduced CPU utilization: + * + * %SOME = time(SOME) / period + * %FULL = time(FULL) / period + * + * Multiple CPUs + * + * The more tasks and available CPUs there are, the more work can be + * performed concurrently. This means that the potential that can go + * unrealized due to resource contention *also* scales with non-idle + * tasks and CPUs. + * + * Consider a scenario where 257 number crunching tasks are trying to + * run concurrently on 256 CPUs. If we simply aggregated the task + * states, we would have to conclude a CPU SOME pressure number of + * 100%, since *somebody* is waiting on a runqueue at all + * times. However, that is clearly not the amount of contention the + * workload is experiencing: only one out of 256 possible exceution + * threads will be contended at any given time, or about 0.4%. + * + * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any + * given time *one* of the tasks is delayed due to a lack of memory. + * Again, looking purely at the task state would yield a memory FULL + * pressure number of 0%, since *somebody* is always making forward + * progress. But again this wouldn't capture the amount of execution + * potential lost, which is 1 out of 4 CPUs, or 25%. + * + * To calculate wasted potential (pressure) with multiple processors, + * we have to base our calculation on the number of non-idle tasks in + * conjunction with the number of available CPUs, which is the number + * of potential execution threads. SOME becomes then the proportion of + * delayed tasks to possibe threads, and FULL is the share of possible + * threads that are unproductive due to delays: + * + * threads = min(nr_nonidle_tasks, nr_cpus) + * SOME = min(nr_delayed_tasks / threads, 1) + * FULL = (threads - min(nr_running_tasks, threads)) / threads + * + * For the 257 number crunchers on 256 CPUs, this yields: + * + * threads = min(257, 256) + * SOME = min(1 / 256, 1) = 0.4% + * FULL = (256 - min(257, 256)) / 256 = 0% + * + * For the 1 out of 4 memory-delayed tasks, this yields: + * + * threads = min(4, 4) + * SOME = min(1 / 4, 1) = 25% + * FULL = (4 - min(3, 4)) / 4 = 25% + * + * [ Substitute nr_cpus with 1, and you can see that it's a natural + * extension of the single-CPU model. ] + * + * Implementation + * + * To assess the precise time spent in each such state, we would have + * to freeze the system on task changes and start/stop the state + * clocks accordingly. Obviously that doesn't scale in practice. + * + * Because the scheduler aims to distribute the compute load evenly + * among the available CPUs, we can track task state locally to each + * CPU and, at much lower frequency, extrapolate the global state for + * the cumulative stall times and the running averages. + * + * For each runqueue, we track: + * + * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) + * + * and then periodically aggregate: + * + * tNONIDLE = sum(tNONIDLE[i]) + * + * tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE + * tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE + * + * %SOME = tSOME / period + * %FULL = tFULL / period + * + * This gives us an approximation of pressure that is practical + * cost-wise, yet way more sensitive and accurate than periodic + * sampling of the aggregate task states would be. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sched.h" + +static int psi_bug __read_mostly; + +bool psi_disabled __read_mostly; +core_param(psi_disabled, psi_disabled, bool, 0644); + +/* Running averages - we need to be higher-res than loadavg */ +#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */ +#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */ +#define EXP_60s 1981 /* 1/exp(2s/60s) */ +#define EXP_300s 2034 /* 1/exp(2s/300s) */ + +/* Sampling frequency in nanoseconds */ +static u64 psi_period __read_mostly; + +/* System-level pressure and stall tracking */ +static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); +static struct psi_group psi_system = { + .pcpu = &system_group_pcpu, +}; + +static void psi_update_work(struct work_struct *work); + +static void group_init(struct psi_group *group) +{ + int cpu; + + for_each_possible_cpu(cpu) + seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + group->next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->clock_work, psi_update_work); + mutex_init(&group->stat_lock); +} + +void __init psi_init(void) +{ + if (psi_disabled) + return; + + psi_period = jiffies_to_nsecs(PSI_FREQ); + group_init(&psi_system); +} + +static bool test_state(unsigned int *tasks, enum psi_states state) +{ + switch (state) { + case PSI_IO_SOME: + return tasks[NR_IOWAIT]; + case PSI_IO_FULL: + return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + case PSI_MEM_SOME: + return tasks[NR_MEMSTALL]; + case PSI_MEM_FULL: + return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + case PSI_CPU_SOME: + return tasks[NR_RUNNING] > 1; + case PSI_NONIDLE: + return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || + tasks[NR_RUNNING]; + default: + return false; + } +} + +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +{ + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; + unsigned int seq; + int s; + + /* Snapshot a coherent view of the CPU state */ + do { + seq = read_seqcount_begin(&groupc->seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(&groupc->seq, seq)); + + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; + /* + * In addition to already concluded states, we also + * incorporate currently active states on the CPU, + * since states may last for many sampling periods. + * + * This way we keep our delta sampling buckets small + * (u32) and our reported pressure close to what's + * actually happening. + */ + if (test_state(tasks, s)) + times[s] += now - state_start; + + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; + + times[s] = delta; + } +} + +static void calc_avgs(unsigned long avg[3], int missed_periods, + u64 time, u64 period) +{ + unsigned long pct; + + /* Fill in zeroes for periods of no activity */ + if (missed_periods) { + avg[0] = calc_load_n(avg[0], EXP_10s, 0, missed_periods); + avg[1] = calc_load_n(avg[1], EXP_60s, 0, missed_periods); + avg[2] = calc_load_n(avg[2], EXP_300s, 0, missed_periods); + } + + /* Sample the most recent active period */ + pct = div_u64(time * 100, period); + pct *= FIXED_1; + avg[0] = calc_load(avg[0], EXP_10s, pct); + avg[1] = calc_load(avg[1], EXP_60s, pct); + avg[2] = calc_load(avg[2], EXP_300s, pct); +} + +static bool update_stats(struct psi_group *group) +{ + u64 deltas[NR_PSI_STATES - 1] = { 0, }; + unsigned long missed_periods = 0; + unsigned long nonidle_total = 0; + u64 now, expires, period; + int cpu; + int s; + + mutex_lock(&group->stat_lock); + + /* + * Collect the per-cpu time buckets and average them into a + * single time sample that is normalized to wallclock time. + * + * For averaging, each CPU is weighted by its non-idle time in + * the sampling period. This eliminates artifacts from uneven + * loading, or even entirely idle CPUs. + */ + for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; + u32 nonidle; + + get_recent_times(group, cpu, times); + + nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); + nonidle_total += nonidle; + + for (s = 0; s < PSI_NONIDLE; s++) + deltas[s] += (u64)times[s] * nonidle; + } + + /* + * Integrate the sample into the running statistics that are + * reported to userspace: the cumulative stall times and the + * decaying averages. + * + * Pressure percentages are sampled at PSI_FREQ. We might be + * called more often when the user polls more frequently than + * that; we might be called less often when there is no task + * activity, thus no data, and clock ticks are sporadic. The + * below handles both. + */ + + /* total= */ + for (s = 0; s < NR_PSI_STATES - 1; s++) + group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + + /* avgX= */ + now = sched_clock(); + expires = group->next_update; + if (now < expires) + goto out; + if (now - expires > psi_period) + missed_periods = div_u64(now - expires, psi_period); + + /* + * The periodic clock tick can get delayed for various + * reasons, especially on loaded systems. To avoid clock + * drift, we schedule the clock in fixed psi_period intervals. + * But the deltas we sample out of the per-cpu buckets above + * are based on the actual time elapsing between clock ticks. + */ + group->next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->last_update + (missed_periods * psi_period)); + group->last_update = now; + + for (s = 0; s < NR_PSI_STATES - 1; s++) { + u32 sample; + + sample = group->total[s] - group->total_prev[s]; + /* + * Due to the lockless sampling of the time buckets, + * recorded time deltas can slip into the next period, + * which under full pressure can result in samples in + * excess of the period length. + * + * We don't want to report non-sensical pressures in + * excess of 100%, nor do we want to drop such events + * on the floor. Instead we punt any overage into the + * future until pressure subsides. By doing this we + * don't underreport the occurring pressure curve, we + * just report it delayed by one period length. + * + * The error isn't cumulative. As soon as another + * delta slips from a period P to P+1, by definition + * it frees up its time T in P. + */ + if (sample > period) + sample = period; + group->total_prev[s] += sample; + calc_avgs(group->avg[s], missed_periods, sample, period); + } +out: + mutex_unlock(&group->stat_lock); + return nonidle_total; +} + +static void psi_update_work(struct work_struct *work) +{ + struct delayed_work *dwork; + struct psi_group *group; + bool nonidle; + + dwork = to_delayed_work(work); + group = container_of(dwork, struct psi_group, clock_work); + + /* + * If there is task activity, periodically fold the per-cpu + * times and feed samples into the running averages. If things + * are idle and there is no data to process, stop the clock. + * Once restarted, we'll catch up the running averages in one + * go - see calc_avgs() and missed_periods. + */ + + nonidle = update_stats(group); + + if (nonidle) { + unsigned long delay = 0; + u64 now; + + now = sched_clock(); + if (group->next_update > now) + delay = nsecs_to_jiffies(group->next_update - now) + 1; + schedule_delayed_work(dwork, delay); + } +} + +static void record_times(struct psi_group_cpu *groupc, int cpu, + bool memstall_tick) +{ + u32 delta; + u64 now; + + now = cpu_clock(cpu); + delta = now - groupc->state_start; + groupc->state_start = now; + + if (test_state(groupc->tasks, PSI_IO_SOME)) { + groupc->times[PSI_IO_SOME] += delta; + if (test_state(groupc->tasks, PSI_IO_FULL)) + groupc->times[PSI_IO_FULL] += delta; + } + + if (test_state(groupc->tasks, PSI_MEM_SOME)) { + groupc->times[PSI_MEM_SOME] += delta; + if (test_state(groupc->tasks, PSI_MEM_FULL)) + groupc->times[PSI_MEM_FULL] += delta; + else if (memstall_tick) { + u32 sample; + /* + * Since we care about lost potential, a + * memstall is FULL when there are no other + * working tasks, but also when the CPU is + * actively reclaiming and nothing productive + * could run even if it were runnable. + * + * When the timer tick sees a reclaiming CPU, + * regardless of runnable tasks, sample a FULL + * tick (or less if it hasn't been a full tick + * since the last state change). + */ + sample = min(delta, (u32)jiffies_to_nsecs(1)); + groupc->times[PSI_MEM_FULL] += sample; + } + } + + if (test_state(groupc->tasks, PSI_CPU_SOME)) + groupc->times[PSI_CPU_SOME] += delta; + + if (test_state(groupc->tasks, PSI_NONIDLE)) + groupc->times[PSI_NONIDLE] += delta; +} + +static void psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) +{ + struct psi_group_cpu *groupc; + unsigned int t, m; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + /* + * First we assess the aggregate resource states this CPU's + * tasks have been in since the last change, and account any + * SOME and FULL time these may have resulted in. + * + * Then we update the task counts according to the state + * change requested through the @clear and @set bits. + */ + write_seqcount_begin(&groupc->seq); + + record_times(groupc, cpu, false); + + for (t = 0, m = clear; m; m &= ~(1 << t), t++) { + if (!(m & (1 << t))) + continue; + if (groupc->tasks[t] == 0 && !psi_bug) { + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + cpu, t, groupc->tasks[0], + groupc->tasks[1], groupc->tasks[2], + clear, set); + psi_bug = 1; + } + groupc->tasks[t]--; + } + + for (t = 0; set; set &= ~(1 << t), t++) + if (set & (1 << t)) + groupc->tasks[t]++; + + write_seqcount_end(&groupc->seq); + + if (!delayed_work_pending(&group->clock_work)) + schedule_delayed_work(&group->clock_work, PSI_FREQ); +} + +void psi_task_change(struct task_struct *task, int clear, int set) +{ + int cpu = task_cpu(task); + + if (!task->pid) + return; + + if (((task->psi_flags & set) || + (task->psi_flags & clear) != clear) && + !psi_bug) { + printk_deferred(KERN_ERR "psi: inconsistent task state! task=%d:%s cpu=%d psi_flags=%x clear=%x set=%x\n", + task->pid, task->comm, cpu, + task->psi_flags, clear, set); + psi_bug = 1; + } + + task->psi_flags &= ~clear; + task->psi_flags |= set; + + psi_group_change(&psi_system, cpu, clear, set); +} + +void psi_memstall_tick(struct task_struct *task, int cpu) +{ + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(psi_system.pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); +} + +/** + * psi_memstall_enter - mark the beginning of a memory stall section + * @flags: flags to handle nested sections + * + * Marks the calling task as being stalled due to a lack of memory, + * such as waiting for a refault or performing reclaim. + */ +void psi_memstall_enter(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + *flags = current->flags & PF_MEMSTALL; + if (*flags) + return; + /* + * PF_MEMSTALL setting & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we can + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags |= PF_MEMSTALL; + psi_task_change(current, 0, TSK_MEMSTALL); + + rq_unlock_irq(rq, &rf); +} + +/** + * psi_memstall_leave - mark the end of an memory stall section + * @flags: flags to handle nested memdelay sections + * + * Marks the calling task as no longer stalled due to lack of memory. + */ +void psi_memstall_leave(unsigned long *flags) +{ + struct rq_flags rf; + struct rq *rq; + + if (psi_disabled) + return; + + if (*flags) + return; + /* + * PF_MEMSTALL clearing & accounting needs to be atomic wrt + * changes to the task's scheduling state, otherwise we could + * race with CPU migration. + */ + rq = this_rq_lock_irq(&rf); + + current->flags &= ~PF_MEMSTALL; + psi_task_change(current, TSK_MEMSTALL, 0); + + rq_unlock_irq(rq, &rf); +} + +static int psi_show(struct seq_file *m, struct psi_group *group, + enum psi_res res) +{ + int full; + + if (psi_disabled) + return -EOPNOTSUPP; + + update_stats(group); + + for (full = 0; full < 2 - (res == PSI_CPU); full++) { + unsigned long avg[3]; + u64 total; + int w; + + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + + seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", + full ? "full" : "some", + LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), + LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), + LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), + total); + } + + return 0; +} + +static int psi_io_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IO); +} + +static int psi_memory_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_MEM); +} + +static int psi_cpu_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_CPU); +} + +static int psi_io_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_io_show, NULL); +} + +static int psi_memory_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_memory_show, NULL); +} + +static int psi_cpu_open(struct inode *inode, struct file *file) +{ + return single_open(file, psi_cpu_show, NULL); +} + +static const struct file_operations psi_io_fops = { + .open = psi_io_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_memory_fops = { + .open = psi_memory_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations psi_cpu_fops = { + .open = psi_cpu_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init psi_proc_init(void) +{ + proc_mkdir("pressure", NULL); + proc_create("pressure/io", 0, NULL, &psi_io_fops); + proc_create("pressure/memory", 0, NULL, &psi_memory_fops); + proc_create("pressure/cpu", 0, NULL, &psi_cpu_fops); + return 0; +} +module_init(psi_proc_init); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1de189bb9209..618577fc9aa8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -319,6 +320,7 @@ extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED #include +#include struct cfs_rq; struct rt_rq; diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8aea199a39b4..4904c4677000 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -55,6 +55,92 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt # define schedstat_val_or_zero(var) 0 #endif /* CONFIG_SCHEDSTATS */ +#ifdef CONFIG_PSI +/* + * PSI tracks state that persists across sleeps, such as iowaits and + * memory stalls. As a result, it has to distinguish between sleeps, + * where a task's runnable state changes, and requeues, where a task + * and its state are being moved between CPUs and runqueues. + */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) +{ + int clear = 0, set = TSK_RUNNING; + + if (psi_disabled) + return; + + if (!wakeup || p->sched_psi_wake_requeue) { + if (p->flags & PF_MEMSTALL) + set |= TSK_MEMSTALL; + if (p->sched_psi_wake_requeue) + p->sched_psi_wake_requeue = 0; + } else { + if (p->in_iowait) + clear |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_dequeue(struct task_struct *p, bool sleep) +{ + int clear = TSK_RUNNING, set = 0; + + if (psi_disabled) + return; + + if (!sleep) { + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + } else { + if (p->in_iowait) + set |= TSK_IOWAIT; + } + + psi_task_change(p, clear, set); +} + +static inline void psi_ttwu_dequeue(struct task_struct *p) +{ + if (psi_disabled) + return; + /* + * Is the task being migrated during a wakeup? Make sure to + * deregister its sleep-persistent psi states from the old + * queue, and let psi_enqueue() know it has to requeue. + */ + if (unlikely(p->in_iowait || (p->flags & PF_MEMSTALL))) { + struct rq_flags rf; + struct rq *rq; + int clear = 0; + + if (p->in_iowait) + clear |= TSK_IOWAIT; + if (p->flags & PF_MEMSTALL) + clear |= TSK_MEMSTALL; + + rq = __task_rq_lock(p, &rf); + psi_task_change(p, clear, 0); + p->sched_psi_wake_requeue = 1; + __task_rq_unlock(rq, &rf); + } +} + +static inline void psi_task_tick(struct rq *rq) +{ + if (psi_disabled) + return; + + if (unlikely(rq->curr->flags & PF_MEMSTALL)) + psi_memstall_tick(rq->curr, cpu_of(rq)); +} +#else /* CONFIG_PSI */ +static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} +static inline void psi_dequeue(struct task_struct *p, bool sleep) {} +static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_task_tick(struct rq *rq) {} +#endif /* CONFIG_PSI */ + #ifdef CONFIG_SCHED_INFO static inline void sched_info_reset_dequeued(struct task_struct *t) { -- cgit v1.2.3 From 2ce7135adc9ad081aa3c49744144376ac74fea60 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 26 Oct 2018 15:06:31 -0700 Subject: psi: cgroup support On a system that executes multiple cgrouped jobs and independent workloads, we don't just care about the health of the overall system, but also that of individual jobs, so that we can ensure individual job health, fairness between jobs, or prioritize some jobs over others. This patch implements pressure stall tracking for cgroups. In kernels with CONFIG_PSI=y, cgroup2 groups will have cpu.pressure, memory.pressure, and io.pressure files that track aggregate pressure stall times for only the tasks inside the cgroup. Link: http://lkml.kernel.org/r/20180828172258.3185-10-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Tested-by: Daniel Drake Tested-by: Suren Baghdasaryan Cc: Christopher Lameter Cc: Ingo Molnar Cc: Johannes Weiner Cc: Mike Galbraith Cc: Peter Enderborg Cc: Randy Dunlap Cc: Shakeel Butt Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 45 ++++++++++++++++++- kernel/sched/psi.c | 118 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 153 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4c1cf0969a80..8b79318810ad 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - rcu_assign_pointer(task->cgroups, to_cset); + cgroup_move_task(task, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } @@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) return ret; } +#ifdef CONFIG_PSI +static int cgroup_io_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); +} +static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); +} +static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) +{ + return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); +} +#endif + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, +#ifdef CONFIG_PSI + { + .name = "io.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_io_pressure_show, + }, + { + .name = "memory.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_memory_pressure_show, + }, + { + .name = "cpu.pressure", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_cpu_pressure_show, + }, +#endif { } /* terminate */ }; @@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + psi_cgroup_free(cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); @@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; - ret = cgroup_bpf_inherit(cgrp); + + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_idr_free; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_psi_free; + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_psi_free: + psi_cgroup_free(cgrp); out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 595414599b98..7cdecfc010af 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->clock_work, PSI_FREQ); } +static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +{ +#ifdef CONFIG_CGROUPS + struct cgroup *cgroup = NULL; + + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else if (*iter == &psi_system) + return NULL; + else + cgroup = cgroup_parent(*iter); + + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } +#else + if (*iter) + return NULL; +#endif + *iter = &psi_system; + return &psi_system; +} + void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); + struct psi_group *group; + void *iter = NULL; if (!task->pid) return; @@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set) task->psi_flags &= ~clear; task->psi_flags |= set; - psi_group_change(&psi_system, cpu, clear, set); + while ((group = iterate_groups(task, &iter))) + psi_group_change(group, cpu, clear, set); } void psi_memstall_tick(struct task_struct *task, int cpu) { - struct psi_group_cpu *groupc; + struct psi_group *group; + void *iter = NULL; - groupc = per_cpu_ptr(psi_system.pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); + while ((group = iterate_groups(task, &iter))) { + struct psi_group_cpu *groupc; + + groupc = per_cpu_ptr(group->pcpu, cpu); + write_seqcount_begin(&groupc->seq); + record_times(groupc, cpu, true); + write_seqcount_end(&groupc->seq); + } } /** @@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } -static int psi_show(struct seq_file *m, struct psi_group *group, - enum psi_res res) +#ifdef CONFIG_CGROUPS +int psi_cgroup_alloc(struct cgroup *cgroup) +{ + if (psi_disabled) + return 0; + + cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); + if (!cgroup->psi.pcpu) + return -ENOMEM; + group_init(&cgroup->psi); + return 0; +} + +void psi_cgroup_free(struct cgroup *cgroup) +{ + if (psi_disabled) + return; + + cancel_delayed_work_sync(&cgroup->psi.clock_work); + free_percpu(cgroup->psi.pcpu); +} + +/** + * cgroup_move_task - move task to a different cgroup + * @task: the task + * @to: the target css_set + * + * Move task to a new cgroup and safely migrate its associated stall + * state between the different groups. + * + * This function acquires the task's rq lock to lock out concurrent + * changes to the task's scheduling state and - in case the task is + * running - concurrent changes to its stall state. + */ +void cgroup_move_task(struct task_struct *task, struct css_set *to) +{ + bool move_psi = !psi_disabled; + unsigned int task_flags = 0; + struct rq_flags rf; + struct rq *rq; + + if (move_psi) { + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + } + + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + + if (move_psi) { + if (task_flags) + psi_task_change(task, 0, task_flags); + + task_rq_unlock(rq, task, &rf); + } +} +#endif /* CONFIG_CGROUPS */ + +int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; -- cgit v1.2.3 From 966cf44f637e6aeea7e3d01ba004bf8b5beac78f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 26 Oct 2018 15:07:52 -0700 Subject: mm: defer ZONE_DEVICE page initialization to the point where we init pgmap The ZONE_DEVICE pages were being initialized in two locations. One was with the memory_hotplug lock held and another was outside of that lock. The problem with this is that it was nearly doubling the memory initialization time. Instead of doing this twice, once while holding a global lock and once without, I am opting to defer the initialization to the one outside of the lock. This allows us to avoid serializing the overhead for memory init and we can instead focus on per-node init times. One issue I encountered is that devm_memremap_pages and hmm_devmmem_pages_create were initializing only the pgmap field the same way. One wasn't initializing hmm_data, and the other was initializing it to a poison value. Since this is something that is exposed to the driver in the case of hmm I am opting for a third option and just initializing hmm_data to 0 since this is going to be exposed to unknown third party drivers. [alexander.h.duyck@linux.intel.com: fix reference count for pgmap in devm_memremap_pages] Link: http://lkml.kernel.org/r/20181008233404.1909.37302.stgit@localhost.localdomain Link: http://lkml.kernel.org/r/20180925202053.3576.66039.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Tested-by: Dan Williams Cc: Dave Hansen Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..620fc4d2559a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -175,10 +175,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + struct dev_pagemap *conflict_pgmap; pgprot_t pgprot = PAGE_KERNEL; + unsigned long pgoff, order; int error, nid, is_ram; - struct dev_pagemap *conflict_pgmap; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -256,19 +256,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (error) goto err_add_memory; - for_each_device_pfn(pfn, pgmap) { - struct page *page = pfn_to_page(pfn); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer. It is a bug if a ZONE_DEVICE page is ever - * freed or placed on a driver-private list. Seed the - * storage with LIST_POISON* values. - */ - list_del(&page->lru); - page->pgmap = pgmap; - percpu_ref_get(pgmap->ref); - } + /* + * Initialization of the pages has been deferred until now in order + * to allow us to do the work while not holding the hotplug lock. + */ + memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT, pgmap); + percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); devm_add_action(dev, devm_memremap_pages_release, pgmap); -- cgit v1.2.3