summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c634
-rw-r--r--kernel/audit.h18
-rw-r--r--kernel/auditsc.c69
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c10
-rw-r--r--kernel/bpf/bpf_lru_list.c20
-rw-r--r--kernel/bpf/core.c256
-rw-r--r--kernel/bpf/hashtab.c261
-rw-r--r--kernel/bpf/helpers.c4
-rw-r--r--kernel/bpf/inode.c17
-rw-r--r--kernel/bpf/lpm_trie.c527
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/bpf/syscall.c30
-rw-r--r--kernel/bpf/verifier.c362
-rw-r--r--kernel/cgroup/Makefile6
-rw-r--r--kernel/cgroup/cgroup-internal.h214
-rw-r--r--kernel/cgroup/cgroup-v1.c1398
-rw-r--r--kernel/cgroup/cgroup.c (renamed from kernel/cgroup.c)2093
-rw-r--r--kernel/cgroup/cpuset.c (renamed from kernel/cpuset.c)2
-rw-r--r--kernel/cgroup/freezer.c (renamed from kernel/cgroup_freezer.c)0
-rw-r--r--kernel/cgroup/namespace.c155
-rw-r--r--kernel/cgroup/pids.c (renamed from kernel/cgroup_pids.c)4
-rw-r--r--kernel/cgroup/rdma.c619
-rw-r--r--kernel/configs/android-base.config2
-rw-r--r--kernel/configs/android-recommended.config3
-rw-r--r--kernel/cpu.c32
-rw-r--r--kernel/cred.c1
-rw-r--r--kernel/debug/debug_core.c5
-rw-r--r--kernel/debug/gdbstub.c1
-rw-r--r--kernel/debug/kdb/kdb_bt.c3
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/events/callchain.c2
-rw-r--r--kernel/events/core.c92
-rw-r--r--kernel/events/uprobes.c30
-rw-r--r--kernel/exit.c41
-rw-r--r--kernel/extable.c6
-rw-r--r--kernel/fork.c87
-rw-r--r--kernel/futex.c26
-rw-r--r--kernel/hung_task.c3
-rw-r--r--kernel/irq/affinity.c20
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/jump_label.c153
-rw-r--r--kernel/kallsyms.c61
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c8
-rw-r--r--kernel/kexec_internal.h6
-rw-r--r--kernel/kmod.c20
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/ksysfs.c2
-rw-r--r--kernel/kthread.c5
-rw-r--r--kernel/latencytop.c2
-rw-r--r--kernel/locking/lockdep.c13
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c4
-rw-r--r--kernel/locking/qspinlock_stat.h1
-rw-r--r--kernel/locking/rtmutex-debug.c1
-rw-r--r--kernel/locking/rtmutex.c4
-rw-r--r--kernel/locking/rtmutex_common.h1
-rw-r--r--kernel/locking/rwsem-spinlock.c19
-rw-r--r--kernel/locking/rwsem-xadd.c4
-rw-r--r--kernel/locking/rwsem.c1
-rw-r--r--kernel/locking/semaphore.c1
-rw-r--r--kernel/locking/test-ww_mutex.c6
-rw-r--r--kernel/memremap.c2
-rw-r--r--kernel/module.c42
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/padata.c5
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/hibernate.c96
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c5
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/internal.h79
-rw-r--r--kernel/printk/printk.c235
-rw-r--r--kernel/printk/printk_safe.c (renamed from kernel/printk/nmi.c)234
-rw-r--r--kernel/profile.c2
-rw-r--r--kernel/ptrace.c17
-rw-r--r--kernel/rcu/rcuperf.c1
-rw-r--r--kernel/rcu/rcutorture.c3
-rw-r--r--kernel/rcu/srcu.c2
-rw-r--r--kernel/rcu/tiny.c14
-rw-r--r--kernel/rcu/tree.c4
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/relay.c6
-rw-r--r--kernel/sched/autogroup.h1
-rw-r--r--kernel/sched/clock.c48
-rw-r--r--kernel/sched/completion.c3
-rw-r--r--kernel/sched/core.c86
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq_schedutil.c40
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/cputime.c6
-rw-r--r--kernel/sched/deadline.c98
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c31
-rw-r--r--kernel/sched/features.h5
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/loadavg.c21
-rw-r--r--kernel/sched/rt.c29
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/sched/stats.h111
-rw-r--r--kernel/sched/swait.c2
-rw-r--r--kernel/sched/wait.c42
-rw-r--r--kernel/seccomp.c33
-rw-r--r--kernel/signal.c24
-rw-r--r--kernel/smp.c1
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/sys.c29
-rw-r--r--kernel/sysctl.c4
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/itimer.c2
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/posix-cpu-timers.c3
-rw-r--r--kernel/time/posix-timers.c1
-rw-r--r--kernel/time/sched_clock.c1
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer.c4
-rw-r--r--kernel/torture.c3
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c78
-rw-r--r--kernel/trace/bpf_trace.c82
-rw-r--r--kernel/trace/ftrace.c417
-rw-r--r--kernel/trace/ring_buffer.c9
-rw-r--r--kernel/trace/ring_buffer_benchmark.c1
-rw-r--r--kernel/trace/trace.c28
-rw-r--r--kernel/trace/trace.h85
-rw-r--r--kernel/trace/trace_benchmark.c4
-rw-r--r--kernel/trace/trace_branch.c83
-rw-r--r--kernel/trace/trace_clock.c1
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_events_hist.c1
-rw-r--r--kernel/trace/trace_events_trigger.c1
-rw-r--r--kernel/trace/trace_hwlat.c40
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_output.c58
-rw-r--r--kernel/trace/trace_probe.c50
-rw-r--r--kernel/trace/trace_probe.h4
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c3
-rw-r--r--kernel/trace/trace_uprobe.c5
-rw-r--r--kernel/tracepoint.c3
-rw-r--r--kernel/tsacct.c4
-rw-r--r--kernel/ucount.c25
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/utsname_sysctl.c1
-rw-r--r--kernel/watchdog.c3
-rw-r--r--kernel/watchdog_hld.c27
-rw-r--r--kernel/workqueue.c1
165 files changed, 6382 insertions, 3520 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 12c679f769c6..b302b4731d16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,10 +64,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
-obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
-obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
-obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUPS) += cgroup/
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/acct.c b/kernel/acct.c
index ca9cb55b5855..5b1284370367 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -56,6 +56,8 @@
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/uaccess.h>
+#include <linux/sched/cputime.h>
+
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
diff --git a/kernel/audit.c b/kernel/audit.c
index 6e399bb69d7c..a871bf80fde1 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -54,6 +54,10 @@
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
#include <linux/audit.h>
@@ -90,13 +94,34 @@ static u32 audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
-/*
- * If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_portid contains
- * the portid to use to send netlink messages to that process.
+/* private audit network namespace index */
+static unsigned int audit_net_id;
+
+/**
+ * struct audit_net - audit private network namespace data
+ * @sk: communication socket
*/
-int audit_pid;
-static __u32 audit_nlk_portid;
+struct audit_net {
+ struct sock *sk;
+};
+
+/**
+ * struct auditd_connection - kernel/auditd connection state
+ * @pid: auditd PID
+ * @portid: netlink portid
+ * @net: the associated network namespace
+ * @lock: spinlock to protect write access
+ *
+ * Description:
+ * This struct is RCU protected; you must either hold the RCU lock for reading
+ * or the included spinlock for writing.
+ */
+static struct auditd_connection {
+ int pid;
+ u32 portid;
+ struct net *net;
+ spinlock_t lock;
+} auditd_conn;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -121,11 +146,7 @@ u32 audit_sig_sid = 0;
3) suppressed due to audit_rate_limit
4) suppressed due to audit_backlog_limit
*/
-static atomic_t audit_lost = ATOMIC_INIT(0);
-
-/* The netlink socket. */
-static struct sock *audit_sock;
-static unsigned int audit_net_id;
+static atomic_t audit_lost = ATOMIC_INIT(0);
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -192,6 +213,43 @@ struct audit_reply {
struct sk_buff *skb;
};
+/**
+ * auditd_test_task - Check to see if a given task is an audit daemon
+ * @task: the task to check
+ *
+ * Description:
+ * Return 1 if the task is a registered audit daemon, 0 otherwise.
+ */
+int auditd_test_task(const struct task_struct *task)
+{
+ int rc;
+
+ rcu_read_lock();
+ rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0);
+ rcu_read_unlock();
+
+ return rc;
+}
+
+/**
+ * audit_get_sk - Return the audit socket for the given network namespace
+ * @net: the destination network namespace
+ *
+ * Description:
+ * Returns the sock pointer if valid, NULL otherwise. The caller must ensure
+ * that a reference is held for the network namespace while the sock is in use.
+ */
+static struct sock *audit_get_sk(const struct net *net)
+{
+ struct audit_net *aunet;
+
+ if (!net)
+ return NULL;
+
+ aunet = net_generic(net, audit_net_id);
+ return aunet->sk;
+}
+
static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
@@ -210,9 +268,7 @@ void audit_panic(const char *message)
pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
- /* test audit_pid since printk is always losey, why bother? */
- if (audit_pid)
- panic("audit: %s\n", message);
+ panic("audit: %s\n", message);
break;
}
}
@@ -370,21 +426,60 @@ static int audit_set_failure(u32 state)
return audit_do_config_change("audit_failure", &audit_failure, state);
}
-/*
- * For one reason or another this nlh isn't getting delivered to the userspace
- * audit daemon, just send it to printk.
+/**
+ * auditd_set - Set/Reset the auditd connection state
+ * @pid: auditd PID
+ * @portid: auditd netlink portid
+ * @net: auditd network namespace pointer
+ *
+ * Description:
+ * This function will obtain and drop network namespace references as
+ * necessary.
+ */
+static void auditd_set(int pid, u32 portid, struct net *net)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&auditd_conn.lock, flags);
+ auditd_conn.pid = pid;
+ auditd_conn.portid = portid;
+ if (auditd_conn.net)
+ put_net(auditd_conn.net);
+ if (net)
+ auditd_conn.net = get_net(net);
+ else
+ auditd_conn.net = NULL;
+ spin_unlock_irqrestore(&auditd_conn.lock, flags);
+}
+
+/**
+ * kauditd_print_skb - Print the audit record to the ring buffer
+ * @skb: audit record
+ *
+ * Whatever the reason, this packet may not make it to the auditd connection
+ * so write it via printk so the information isn't completely lost.
*/
static void kauditd_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
char *data = nlmsg_data(nlh);
- if (nlh->nlmsg_type != AUDIT_EOE) {
- if (printk_ratelimit())
- pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
- else
- audit_log_lost("printk limit exceeded");
- }
+ if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit())
+ pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
+}
+
+/**
+ * kauditd_rehold_skb - Handle a audit record send failure in the hold queue
+ * @skb: audit record
+ *
+ * Description:
+ * This should only be used by the kauditd_thread when it fails to flush the
+ * hold queue.
+ */
+static void kauditd_rehold_skb(struct sk_buff *skb)
+{
+ /* put the record back in the queue at the same place */
+ skb_queue_head(&audit_hold_queue, skb);
}
/**
@@ -444,65 +539,163 @@ static void kauditd_retry_skb(struct sk_buff *skb)
* auditd_reset - Disconnect the auditd connection
*
* Description:
- * Break the auditd/kauditd connection and move all the records in the retry
- * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex
- * must be held when calling this function.
+ * Break the auditd/kauditd connection and move all the queued records into the
+ * hold queue in case auditd reconnects.
*/
static void auditd_reset(void)
{
struct sk_buff *skb;
- /* break the connection */
- if (audit_sock) {
- sock_put(audit_sock);
- audit_sock = NULL;
- }
- audit_pid = 0;
- audit_nlk_portid = 0;
+ /* if it isn't already broken, break the connection */
+ rcu_read_lock();
+ if (auditd_conn.pid)
+ auditd_set(0, 0, NULL);
+ rcu_read_unlock();
- /* flush all of the retry queue to the hold queue */
+ /* flush all of the main and retry queues to the hold queue */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb);
+ while ((skb = skb_dequeue(&audit_queue)))
+ kauditd_hold_skb(skb);
}
/**
- * kauditd_send_unicast_skb - Send a record via unicast to auditd
+ * auditd_send_unicast_skb - Send a record via unicast to auditd
* @skb: audit record
+ *
+ * Description:
+ * Send a skb to the audit daemon, returns positive/zero values on success and
+ * negative values on failure; in all cases the skb will be consumed by this
+ * function. If the send results in -ECONNREFUSED the connection with auditd
+ * will be reset. This function may sleep so callers should not hold any locks
+ * where this would cause a problem.
*/
-static int kauditd_send_unicast_skb(struct sk_buff *skb)
+static int auditd_send_unicast_skb(struct sk_buff *skb)
{
int rc;
+ u32 portid;
+ struct net *net;
+ struct sock *sk;
+
+ /* NOTE: we can't call netlink_unicast while in the RCU section so
+ * take a reference to the network namespace and grab local
+ * copies of the namespace, the sock, and the portid; the
+ * namespace and sock aren't going to go away while we hold a
+ * reference and if the portid does become invalid after the RCU
+ * section netlink_unicast() should safely return an error */
+
+ rcu_read_lock();
+ if (!auditd_conn.pid) {
+ rcu_read_unlock();
+ rc = -ECONNREFUSED;
+ goto err;
+ }
+ net = auditd_conn.net;
+ get_net(net);
+ sk = audit_get_sk(net);
+ portid = auditd_conn.portid;
+ rcu_read_unlock();
- /* if we know nothing is connected, don't even try the netlink call */
- if (!audit_pid)
- return -ECONNREFUSED;
+ rc = netlink_unicast(sk, skb, portid, 0);
+ put_net(net);
+ if (rc < 0)
+ goto err;
- /* get an extra skb reference in case we fail to send */
- skb_get(skb);
- rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
- if (rc >= 0) {
- consume_skb(skb);
- rc = 0;
- }
+ return rc;
+err:
+ if (rc == -ECONNREFUSED)
+ auditd_reset();
return rc;
}
+/**
+ * kauditd_send_queue - Helper for kauditd_thread to flush skb queues
+ * @sk: the sending sock
+ * @portid: the netlink destination
+ * @queue: the skb queue to process
+ * @retry_limit: limit on number of netlink unicast failures
+ * @skb_hook: per-skb hook for additional processing
+ * @err_hook: hook called if the skb fails the netlink unicast send
+ *
+ * Description:
+ * Run through the given queue and attempt to send the audit records to auditd,
+ * returns zero on success, negative values on failure. It is up to the caller
+ * to ensure that the @sk is valid for the duration of this function.
+ *
+ */
+static int kauditd_send_queue(struct sock *sk, u32 portid,
+ struct sk_buff_head *queue,
+ unsigned int retry_limit,
+ void (*skb_hook)(struct sk_buff *skb),
+ void (*err_hook)(struct sk_buff *skb))
+{
+ int rc = 0;
+ struct sk_buff *skb;
+ static unsigned int failed = 0;
+
+ /* NOTE: kauditd_thread takes care of all our locking, we just use
+ * the netlink info passed to us (e.g. sk and portid) */
+
+ while ((skb = skb_dequeue(queue))) {
+ /* call the skb_hook for each skb we touch */
+ if (skb_hook)
+ (*skb_hook)(skb);
+
+ /* can we send to anyone via unicast? */
+ if (!sk) {
+ if (err_hook)
+ (*err_hook)(skb);
+ continue;
+ }
+
+ /* grab an extra skb reference in case of error */
+ skb_get(skb);
+ rc = netlink_unicast(sk, skb, portid, 0);
+ if (rc < 0) {
+ /* fatal failure for our queue flush attempt? */
+ if (++failed >= retry_limit ||
+ rc == -ECONNREFUSED || rc == -EPERM) {
+ /* yes - error processing for the queue */
+ sk = NULL;
+ if (err_hook)
+ (*err_hook)(skb);
+ if (!skb_hook)
+ goto out;
+ /* keep processing with the skb_hook */
+ continue;
+ } else
+ /* no - requeue to preserve ordering */
+ skb_queue_head(queue, skb);
+ } else {
+ /* it worked - drop the extra reference and continue */
+ consume_skb(skb);
+ failed = 0;
+ }
+ }
+
+out:
+ return (rc >= 0 ? 0 : rc);
+}
+
/*
* kauditd_send_multicast_skb - Send a record to any multicast listeners
* @skb: audit record
*
* Description:
- * This function doesn't consume an skb as might be expected since it has to
- * copy it anyways.
+ * Write a multicast message to anyone listening in the initial network
+ * namespace. This function doesn't consume an skb as might be expected since
+ * it has to copy it anyways.
*/
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
struct sk_buff *copy;
- struct audit_net *aunet = net_generic(&init_net, audit_net_id);
- struct sock *sock = aunet->nlsk;
+ struct sock *sock = audit_get_sk(&init_net);
struct nlmsghdr *nlh;
+ /* NOTE: we are not taking an additional reference for init_net since
+ * we don't have to worry about it going away */
+
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
@@ -526,149 +719,79 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
}
/**
- * kauditd_wake_condition - Return true when it is time to wake kauditd_thread
- *
- * Description:
- * This function is for use by the wait_event_freezable() call in
- * kauditd_thread().
+ * kauditd_thread - Worker thread to send audit records to userspace
+ * @dummy: unused
*/
-static int kauditd_wake_condition(void)
-{
- static int pid_last = 0;
- int rc;
- int pid = audit_pid;
-
- /* wake on new messages or a change in the connected auditd */
- rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last);
- if (rc)
- pid_last = pid;
-
- return rc;
-}
-
static int kauditd_thread(void *dummy)
{
int rc;
- int auditd = 0;
- int reschedule = 0;
- struct sk_buff *skb;
- struct nlmsghdr *nlh;
+ u32 portid = 0;
+ struct net *net = NULL;
+ struct sock *sk = NULL;
#define UNICAST_RETRIES 5
-#define AUDITD_BAD(x,y) \
- ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES)
-
- /* NOTE: we do invalidate the auditd connection flag on any sending
- * errors, but we only "restore" the connection flag at specific places
- * in the loop in order to help ensure proper ordering of audit
- * records */
set_freezable();
while (!kthread_should_stop()) {
- /* NOTE: possible area for future improvement is to look at
- * the hold and retry queues, since only this thread
- * has access to these queues we might be able to do
- * our own queuing and skip some/all of the locking */
-
- /* NOTE: it might be a fun experiment to split the hold and
- * retry queue handling to another thread, but the
- * synchronization issues and other overhead might kill
- * any performance gains */
+ /* NOTE: see the lock comments in auditd_send_unicast_skb() */
+ rcu_read_lock();
+ if (!auditd_conn.pid) {
+ rcu_read_unlock();
+ goto main_queue;
+ }
+ net = auditd_conn.net;
+ get_net(net);
+ sk = audit_get_sk(net);
+ portid = auditd_conn.portid;
+ rcu_read_unlock();
/* attempt to flush the hold queue */
- while (auditd && (skb = skb_dequeue(&audit_hold_queue))) {
- rc = kauditd_send_unicast_skb(skb);
- if (rc) {
- /* requeue to the same spot */
- skb_queue_head(&audit_hold_queue, skb);
-
- auditd = 0;
- if (AUDITD_BAD(rc, reschedule)) {
- mutex_lock(&audit_cmd_mutex);
- auditd_reset();
- mutex_unlock(&audit_cmd_mutex);
- reschedule = 0;
- }
- } else
- /* we were able to send successfully */
- reschedule = 0;
+ rc = kauditd_send_queue(sk, portid,
+ &audit_hold_queue, UNICAST_RETRIES,
+ NULL, kauditd_rehold_skb);
+ if (rc < 0) {
+ sk = NULL;
+ auditd_reset();
+ goto main_queue;
}
/* attempt to flush the retry queue */
- while (auditd && (skb = skb_dequeue(&audit_retry_queue))) {
- rc = kauditd_send_unicast_skb(skb);
- if (rc) {
- auditd = 0;
- if (AUDITD_BAD(rc, reschedule)) {
- kauditd_hold_skb(skb);
- mutex_lock(&audit_cmd_mutex);
- auditd_reset();
- mutex_unlock(&audit_cmd_mutex);
- reschedule = 0;
- } else
- /* temporary problem (we hope), queue
- * to the same spot and retry */
- skb_queue_head(&audit_retry_queue, skb);
- } else
- /* we were able to send successfully */
- reschedule = 0;
+ rc = kauditd_send_queue(sk, portid,
+ &audit_retry_queue, UNICAST_RETRIES,
+ NULL, kauditd_hold_skb);
+ if (rc < 0) {
+ sk = NULL;
+ auditd_reset();
+ goto main_queue;
}
- /* standard queue processing, try to be as quick as possible */
-quick_loop:
- skb = skb_dequeue(&audit_queue);
- if (skb) {
- /* setup the netlink header, see the comments in
- * kauditd_send_multicast_skb() for length quirks */
- nlh = nlmsg_hdr(skb);
- nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
-
- /* attempt to send to any multicast listeners */
- kauditd_send_multicast_skb(skb);
-
- /* attempt to send to auditd, queue on failure */
- if (auditd) {
- rc = kauditd_send_unicast_skb(skb);
- if (rc) {
- auditd = 0;
- if (AUDITD_BAD(rc, reschedule)) {
- mutex_lock(&audit_cmd_mutex);
- auditd_reset();
- mutex_unlock(&audit_cmd_mutex);
- reschedule = 0;
- }
-
- /* move to the retry queue */
- kauditd_retry_skb(skb);
- } else
- /* everything is working so go fast! */
- goto quick_loop;
- } else if (reschedule)
- /* we are currently having problems, move to
- * the retry queue */
- kauditd_retry_skb(skb);
- else
- /* dump the message via printk and hold it */
- kauditd_hold_skb(skb);
- } else {
- /* we have flushed the backlog so wake everyone */
- wake_up(&audit_backlog_wait);
-
- /* if everything is okay with auditd (if present), go
- * to sleep until there is something new in the queue
- * or we have a change in the connected auditd;
- * otherwise simply reschedule to give things a chance
- * to recover */
- if (reschedule) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule();
- } else
- wait_event_freezable(kauditd_wait,
- kauditd_wake_condition());
-
- /* update the auditd connection status */
- auditd = (audit_pid ? 1 : 0);
+main_queue:
+ /* process the main queue - do the multicast send and attempt
+ * unicast, dump failed record sends to the retry queue; if
+ * sk == NULL due to previous failures we will just do the
+ * multicast send and move the record to the retry queue */
+ rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
+ kauditd_send_multicast_skb,
+ kauditd_retry_skb);
+ if (sk == NULL || rc < 0)
+ auditd_reset();
+ sk = NULL;
+
+ /* drop our netns reference, no auditd sends past this line */
+ if (net) {
+ put_net(net);
+ net = NULL;
}
+
+ /* we have processed all the queues so wake everyone */
+ wake_up(&audit_backlog_wait);
+
+ /* NOTE: we want to wake up if there is anything on the queue,
+ * regardless of if an auditd is connected, as we need to
+ * do the multicast send and rotate records from the
+ * main queue to the retry/hold queues */
+ wait_event_freezable(kauditd_wait,
+ (skb_queue_len(&audit_queue) ? 1 : 0));
}
return 0;
@@ -678,17 +801,16 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = dest->net;
- struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ netlink_unicast(sk, skb, dest->portid, 0);
- put_net(net);
+ put_net(dest->net);
kfree(dest);
return 0;
@@ -722,16 +844,15 @@ out_kfree_skb:
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = reply->net;
- struct audit_net *aunet = net_generic(net, audit_net_id);
+ struct sock *sk = audit_get_sk(reply->net);
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
- put_net(net);
+ netlink_unicast(sk, reply->skb, reply->portid, 0);
+ put_net(reply->net);
kfree(reply);
return 0;
}
@@ -949,12 +1070,12 @@ static int audit_set_feature(struct sk_buff *skb)
static int audit_replace(pid_t pid)
{
- struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0,
- &pid, sizeof(pid));
+ struct sk_buff *skb;
+ skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));
if (!skb)
return -ENOMEM;
- return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
+ return auditd_send_unicast_skb(skb);
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -981,7 +1102,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
s.enabled = audit_enabled;
s.failure = audit_failure;
- s.pid = audit_pid;
+ rcu_read_lock();
+ s.pid = auditd_conn.pid;
+ rcu_read_unlock();
s.rate_limit = audit_rate_limit;
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
@@ -1014,30 +1137,44 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
* from the initial pid namespace, but something
* to keep in mind if this changes */
int new_pid = s.pid;
+ pid_t auditd_pid;
pid_t requesting_pid = task_tgid_vnr(current);
- if ((!new_pid) && (requesting_pid != audit_pid)) {
- audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
+ /* test the auditd connection */
+ audit_replace(requesting_pid);
+
+ rcu_read_lock();
+ auditd_pid = auditd_conn.pid;
+ /* only the current auditd can unregister itself */
+ if ((!new_pid) && (requesting_pid != auditd_pid)) {
+ rcu_read_unlock();
+ audit_log_config_change("audit_pid", new_pid,
+ auditd_pid, 0);
return -EACCES;
}
- if (audit_pid && new_pid &&
- audit_replace(requesting_pid) != -ECONNREFUSED) {
- audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
+ /* replacing a healthy auditd is not allowed */
+ if (auditd_pid && new_pid) {
+ rcu_read_unlock();
+ audit_log_config_change("audit_pid", new_pid,
+ auditd_pid, 0);
return -EEXIST;
}
+ rcu_read_unlock();
+
if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
+ audit_log_config_change("audit_pid", new_pid,
+ auditd_pid, 1);
+
if (new_pid) {
- if (audit_sock)
- sock_put(audit_sock);
- audit_pid = new_pid;
- audit_nlk_portid = NETLINK_CB(skb).portid;
- sock_hold(skb->sk);
- audit_sock = skb->sk;
- } else {
+ /* register a new auditd connection */
+ auditd_set(new_pid,
+ NETLINK_CB(skb).portid,
+ sock_net(NETLINK_CB(skb).sk));
+ /* try to process any backlog */
+ wake_up_interruptible(&kauditd_wait);
+ } else
+ /* unregister the auditd connection */
auditd_reset();
- }
- wake_up_interruptible(&kauditd_wait);
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(s.rate_limit);
@@ -1058,6 +1195,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err < 0)
return err;
}
+ if (s.mask == AUDIT_STATUS_LOST) {
+ u32 lost = atomic_xchg(&audit_lost, 0);
+
+ audit_log_config_change("lost", 0, lost, 1);
+ return lost;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -1084,7 +1227,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
break;
}
- mutex_unlock(&audit_cmd_mutex);
audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
audit_log_format(ab, " msg='%.*s'",
@@ -1102,7 +1244,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
- mutex_lock(&audit_cmd_mutex);
}
break;
case AUDIT_ADD_RULE:
@@ -1292,26 +1433,26 @@ static int __net_init audit_net_init(struct net *net)
struct audit_net *aunet = net_generic(net, audit_net_id);
- aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
- if (aunet->nlsk == NULL) {
+ aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
+ if (aunet->sk == NULL) {
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
return 0;
}
static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
- struct sock *sock = aunet->nlsk;
- mutex_lock(&audit_cmd_mutex);
- if (sock == audit_sock)
+
+ rcu_read_lock();
+ if (net == auditd_conn.net)
auditd_reset();
- mutex_unlock(&audit_cmd_mutex);
+ rcu_read_unlock();
- netlink_kernel_release(sock);
- aunet->nlsk = NULL;
+ netlink_kernel_release(aunet->sk);
}
static struct pernet_operations audit_net_ops __net_initdata = {
@@ -1329,27 +1470,33 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
- pr_info("initializing netlink subsys (%s)\n",
- audit_default ? "enabled" : "disabled");
- register_pernet_subsys(&audit_net_ops);
+ memset(&auditd_conn, 0, sizeof(auditd_conn));
+ spin_lock_init(&auditd_conn.lock);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
skb_queue_head_init(&audit_hold_queue);
- audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
+ pr_info("initializing netlink subsys (%s)\n",
+ audit_default ? "enabled" : "disabled");
+ register_pernet_subsys(&audit_net_ops);
+
+ audit_initialized = AUDIT_INITIALIZED;
+ audit_enabled = audit_default;
+ audit_ever_enabled |= !!audit_default;
+
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
int err = PTR_ERR(kauditd_task);
panic("audit: failed to start the kauditd thread (%d)\n", err);
}
- audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
+ "state=initialized audit_enabled=%u res=1",
+ audit_enabled);
return 0;
}
@@ -1511,20 +1658,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
return NULL;
- /* don't ever fail/sleep on these two conditions:
+ /* NOTE: don't ever fail/sleep on these two conditions:
* 1. auditd generated record - since we need auditd to drain the
* queue; also, when we are checking for auditd, compare PIDs using
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
- * 2. audit command message - record types 1000 through 1099 inclusive
- * are command messages/records used to manage the kernel subsystem
- * and the audit userspace, blocking on these messages could cause
- * problems under load so don't do it (note: not all of these
- * command types are valid as record types, but it is quicker to
- * just check two ints than a series of ints in a if/switch stmt) */
- if (!((audit_pid && audit_pid == task_tgid_vnr(current)) ||
- (type >= 1000 && type <= 1099))) {
- long sleep_time = audit_backlog_wait_time;
+ * 2. generator holding the audit_cmd_mutex - we don't want to block
+ * while holding the mutex */
+ if (!(auditd_test_task(current) ||
+ (current == __mutex_owner(&audit_cmd_mutex)))) {
+ long stime = audit_backlog_wait_time;
while (audit_backlog_limit &&
(skb_queue_len(&audit_queue) > audit_backlog_limit)) {
@@ -1533,14 +1676,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
- if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
- (sleep_time > 0)) {
+ if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
- sleep_time = schedule_timeout(sleep_time);
+ stime = schedule_timeout(stime);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
@@ -2119,15 +2261,27 @@ out:
*/
void audit_log_end(struct audit_buffer *ab)
{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+
if (!ab)
return;
- if (!audit_rate_check()) {
- audit_log_lost("rate limit exceeded");
- } else {
- skb_queue_tail(&audit_queue, ab->skb);
- wake_up_interruptible(&kauditd_wait);
+
+ if (audit_rate_check()) {
+ skb = ab->skb;
ab->skb = NULL;
- }
+
+ /* setup the netlink header, see the comments in
+ * kauditd_send_multicast_skb() for length quirks */
+ nlh = nlmsg_hdr(skb);
+ nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
+
+ /* queue the netlink packet and poke the kauditd thread */
+ skb_queue_tail(&audit_queue, skb);
+ wake_up_interruptible(&kauditd_wait);
+ } else
+ audit_log_lost("rate limit exceeded");
+
audit_buffer_free(ab);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 960d49c9db5e..0d87f8ab8778 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -199,6 +199,9 @@ struct audit_context {
struct {
int argc;
} execve;
+ struct {
+ char *name;
+ } module;
};
int fds[2];
struct audit_proctitle proctitle;
@@ -215,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,
struct audit_names *n, const struct path *path,
int record_num, int *call_panic);
-extern int audit_pid;
+extern int auditd_test_task(const struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -247,10 +250,6 @@ struct audit_netlink_list {
int audit_send_list(void *);
-struct audit_net {
- struct sock *nlsk;
-};
-
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
@@ -334,14 +333,7 @@ extern u32 audit_sig_sid;
extern int audit_filter(int msgtype, unsigned int listtype);
#ifdef CONFIG_AUDITSYSCALL
-extern int __audit_signal_info(int sig, struct task_struct *t);
-static inline int audit_signal_info(int sig, struct task_struct *t)
-{
- if (unlikely((audit_pid && t->tgid == audit_pid) ||
- (audit_signals && !audit_dummy_context())))
- return __audit_signal_info(sig, t);
- return 0;
-}
+extern int audit_signal_info(int sig, struct task_struct *t);
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
extern struct list_head *audit_killed_trees(void);
#else
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf1fa43512c1..1c2333155893 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -762,7 +762,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
struct audit_entry *e;
enum audit_state state;
- if (audit_pid && tsk->tgid == audit_pid)
+ if (auditd_test_task(tsk))
return AUDIT_DISABLED;
rcu_read_lock();
@@ -816,7 +816,7 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
struct audit_names *n;
- if (audit_pid && tsk->tgid == audit_pid)
+ if (auditd_test_task(tsk))
return;
rcu_read_lock();
@@ -1221,7 +1221,7 @@ static void show_special(struct audit_context *context, int *call_panic)
context->ipc.perm_mode);
}
break; }
- case AUDIT_MQ_OPEN: {
+ case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
@@ -1230,8 +1230,8 @@ static void show_special(struct audit_context *context, int *call_panic)
context->mq_open.attr.mq_maxmsg,
context->mq_open.attr.mq_msgsize,
context->mq_open.attr.mq_curmsgs);
- break; }
- case AUDIT_MQ_SENDRECV: {
+ break;
+ case AUDIT_MQ_SENDRECV:
audit_log_format(ab,
"mqdes=%d msg_len=%zd msg_prio=%u "
"abs_timeout_sec=%ld abs_timeout_nsec=%ld",
@@ -1240,12 +1240,12 @@ static void show_special(struct audit_context *context, int *call_panic)
context->mq_sendrecv.msg_prio,
context->mq_sendrecv.abs_timeout.tv_sec,
context->mq_sendrecv.abs_timeout.tv_nsec);
- break; }
- case AUDIT_MQ_NOTIFY: {
+ break;
+ case AUDIT_MQ_NOTIFY:
audit_log_format(ab, "mqdes=%d sigev_signo=%d",
context->mq_notify.mqdes,
context->mq_notify.sigev_signo);
- break; }
+ break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
audit_log_format(ab,
@@ -1255,19 +1255,24 @@ static void show_special(struct audit_context *context, int *call_panic)
attr->mq_flags, attr->mq_maxmsg,
attr->mq_msgsize, attr->mq_curmsgs);
break; }
- case AUDIT_CAPSET: {
+ case AUDIT_CAPSET:
audit_log_format(ab, "pid=%d", context->capset.pid);
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
- break; }
- case AUDIT_MMAP: {
+ break;
+ case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
- break; }
- case AUDIT_EXECVE: {
+ break;
+ case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
- break; }
+ break;
+ case AUDIT_KERN_MODULE:
+ audit_log_format(ab, "name=");
+ audit_log_untrustedstring(ab, context->module.name);
+ kfree(context->module.name);
+ break;
}
audit_log_end(ab);
}
@@ -2244,26 +2249,27 @@ void __audit_ptrace(struct task_struct *t)
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-int __audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info(int sig, struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
kuid_t uid = current_uid(), t_uid = task_uid(t);
- if (audit_pid && t->tgid == audit_pid) {
- if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_tgid_nr(tsk);
- if (uid_valid(tsk->loginuid))
- audit_sig_uid = tsk->loginuid;
- else
- audit_sig_uid = uid;
- security_task_getsecid(tsk, &audit_sig_sid);
- }
- if (!audit_signals || audit_dummy_context())
- return 0;
+ if (auditd_test_task(t) &&
+ (sig == SIGTERM || sig == SIGHUP ||
+ sig == SIGUSR1 || sig == SIGUSR2)) {
+ audit_sig_pid = task_tgid_nr(tsk);
+ if (uid_valid(tsk->loginuid))
+ audit_sig_uid = tsk->loginuid;
+ else
+ audit_sig_uid = uid;
+ security_task_getsecid(tsk, &audit_sig_sid);
}
+ if (!audit_signals || audit_dummy_context())
+ return 0;
+
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {
@@ -2368,6 +2374,15 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
+void __audit_log_kern_module(char *name)
+{
+ struct audit_context *context = current->audit_context;
+
+ context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+ strcpy(context->module.name, name);
+ context->type = AUDIT_KERN_MODULE;
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
@@ -2411,7 +2426,7 @@ void audit_core_dumps(long signr)
if (unlikely(!ab))
return;
audit_log_task(ab);
- audit_log_format(ab, " sig=%ld", signr);
+ audit_log_format(ab, " sig=%ld res=1", signr);
audit_log_end(ab);
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474ac3cd..e1ce4f4fd7fd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3d55d95dcf49..6b6f41f0b211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -269,7 +269,7 @@ static const struct bpf_map_ops array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list array_type __read_mostly = {
+static struct bpf_map_type_list array_type __ro_after_init = {
.ops = &array_ops,
.type = BPF_MAP_TYPE_ARRAY,
};
@@ -283,7 +283,7 @@ static const struct bpf_map_ops percpu_array_ops = {
.map_delete_elem = array_map_delete_elem,
};
-static struct bpf_map_type_list percpu_array_type __read_mostly = {
+static struct bpf_map_type_list percpu_array_type __ro_after_init = {
.ops = &percpu_array_ops,
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};
@@ -409,7 +409,7 @@ static const struct bpf_map_ops prog_array_ops = {
.map_fd_put_ptr = prog_fd_array_put_ptr,
};
-static struct bpf_map_type_list prog_array_type __read_mostly = {
+static struct bpf_map_type_list prog_array_type __ro_after_init = {
.ops = &prog_array_ops,
.type = BPF_MAP_TYPE_PROG_ARRAY,
};
@@ -522,7 +522,7 @@ static const struct bpf_map_ops perf_event_array_ops = {
.map_release = perf_event_fd_array_release,
};
-static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+static struct bpf_map_type_list perf_event_array_type __ro_after_init = {
.ops = &perf_event_array_ops,
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
@@ -564,7 +564,7 @@ static const struct bpf_map_ops cgroup_array_ops = {
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};
-static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+static struct bpf_map_type_list cgroup_array_type __ro_after_init = {
.ops = &cgroup_array_ops,
.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 89b7ef41c86b..f62d1d56f41d 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
enum bpf_lru_list_type tgt_free_type)
{
struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
- struct bpf_lru_node *node, *tmp_node, *first_node;
+ struct bpf_lru_node *node, *tmp_node;
unsigned int nshrinked = 0;
unsigned int i = 0;
- first_node = list_first_entry(inactive, struct bpf_lru_node, list);
list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
if (bpf_lru_node_is_ref(node)) {
__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
@@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru,
list_add(&node->list, local_pending_list(loc_l));
}
-struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node;
@@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
return node;
}
-struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru,
- struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
{
struct bpf_lru_node *node;
bool force = false;
@@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
bpf_common_lru_push_free(lru, node);
}
-void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
- u32 elem_size, u32 nr_elems)
+static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
+ u32 node_offset, u32 elem_size,
+ u32 nr_elems)
{
struct bpf_lru_list *l = &lru->common_lru.lru_list;
u32 i;
@@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
}
}
-void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
- u32 elem_size, u32 nr_elems)
+static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
+ u32 node_offset, u32 elem_size,
+ u32 nr_elems)
{
u32 i, pcpu_entries;
int cpu;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 503d4211988a..b4f1cb0c5ac7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -28,6 +28,9 @@
#include <linux/moduleloader.h>
#include <linux/bpf.h>
#include <linux/frame.h>
+#include <linux/rbtree_latch.h>
+#include <linux/kallsyms.h>
+#include <linux/rcupdate.h>
#include <asm/unaligned.h>
@@ -95,6 +98,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
fp->aux = aux;
fp->aux->prog = fp;
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
+
return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_alloc);
@@ -290,6 +295,206 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
}
#ifdef CONFIG_BPF_JIT
+static __always_inline void
+bpf_get_prog_addr_region(const struct bpf_prog *prog,
+ unsigned long *symbol_start,
+ unsigned long *symbol_end)
+{
+ const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
+ unsigned long addr = (unsigned long)hdr;
+
+ WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
+
+ *symbol_start = addr;
+ *symbol_end = addr + hdr->pages * PAGE_SIZE;
+}
+
+static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+{
+ BUILD_BUG_ON(sizeof("bpf_prog_") +
+ sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+
+ sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
+ sym = bin2hex(sym, prog->tag, sizeof(prog->tag));
+ *sym = 0;
+}
+
+static __always_inline unsigned long
+bpf_get_prog_addr_start(struct latch_tree_node *n)
+{
+ unsigned long symbol_start, symbol_end;
+ const struct bpf_prog_aux *aux;
+
+ aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+ return symbol_start;
+}
+
+static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
+ struct latch_tree_node *b)
+{
+ return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
+}
+
+static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
+{
+ unsigned long val = (unsigned long)key;
+ unsigned long symbol_start, symbol_end;
+ const struct bpf_prog_aux *aux;
+
+ aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
+ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+
+ if (val < symbol_start)
+ return -1;
+ if (val >= symbol_end)
+ return 1;
+
+ return 0;
+}
+
+static const struct latch_tree_ops bpf_tree_ops = {
+ .less = bpf_tree_less,
+ .comp = bpf_tree_comp,
+};
+
+static DEFINE_SPINLOCK(bpf_lock);
+static LIST_HEAD(bpf_kallsyms);
+static struct latch_tree_root bpf_tree __cacheline_aligned;
+
+int bpf_jit_kallsyms __read_mostly;
+
+static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
+{
+ WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
+ list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
+ latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+}
+
+static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
+{
+ if (list_empty(&aux->ksym_lnode))
+ return;
+
+ latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+ list_del_rcu(&aux->ksym_lnode);
+}
+
+static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
+{
+ return fp->jited && !bpf_prog_was_classic(fp);
+}
+
+static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
+{
+ return list_empty(&fp->aux->ksym_lnode) ||
+ fp->aux->ksym_lnode.prev == LIST_POISON2;
+}
+
+void bpf_prog_kallsyms_add(struct bpf_prog *fp)
+{
+ unsigned long flags;
+
+ if (!bpf_prog_kallsyms_candidate(fp) ||
+ !capable(CAP_SYS_ADMIN))
+ return;
+
+ spin_lock_irqsave(&bpf_lock, flags);
+ bpf_prog_ksym_node_add(fp->aux);
+ spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+void bpf_prog_kallsyms_del(struct bpf_prog *fp)
+{
+ unsigned long flags;
+
+ if (!bpf_prog_kallsyms_candidate(fp))
+ return;
+
+ spin_lock_irqsave(&bpf_lock, flags);
+ bpf_prog_ksym_node_del(fp->aux);
+ spin_unlock_irqrestore(&bpf_lock, flags);
+}
+
+static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
+{
+ struct latch_tree_node *n;
+
+ if (!bpf_jit_kallsyms_enabled())
+ return NULL;
+
+ n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
+ return n ?
+ container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
+ NULL;
+}
+
+const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
+ unsigned long *off, char *sym)
+{
+ unsigned long symbol_start, symbol_end;
+ struct bpf_prog *prog;
+ char *ret = NULL;
+
+ rcu_read_lock();
+ prog = bpf_prog_kallsyms_find(addr);
+ if (prog) {
+ bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
+ bpf_get_prog_name(prog, sym);
+
+ ret = sym;
+ if (size)
+ *size = symbol_end - symbol_start;
+ if (off)
+ *off = addr - symbol_start;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+bool is_bpf_text_address(unsigned long addr)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = bpf_prog_kallsyms_find(addr) != NULL;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+ unsigned long symbol_start, symbol_end;
+ struct bpf_prog_aux *aux;
+ unsigned int it = 0;
+ int ret = -ERANGE;
+
+ if (!bpf_jit_kallsyms_enabled())
+ return ret;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
+ if (it++ != symnum)
+ continue;
+
+ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+ bpf_get_prog_name(aux->prog, sym);
+
+ *value = symbol_start;
+ *type = BPF_SYM_ELF_TYPE;
+
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
@@ -326,6 +531,24 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
module_memfree(hdr);
}
+/* This symbol is only overridden by archs that have different
+ * requirements than the usual eBPF JITs, f.e. when they only
+ * implement cBPF JIT, do not set images read-only, etc.
+ */
+void __weak bpf_jit_free(struct bpf_prog *fp)
+{
+ if (fp->jited) {
+ struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+
+ bpf_jit_binary_unlock_ro(hdr);
+ bpf_jit_binary_free(hdr);
+
+ WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
+ }
+
+ bpf_prog_unlock_free(fp);
+}
+
int bpf_jit_harden __read_mostly;
static int bpf_jit_blind_insn(const struct bpf_insn *from,
@@ -939,12 +1162,12 @@ out:
LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
off = IMM;
load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
- * only appearing in the programs where ctx ==
- * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
- * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
- * internal BPF verifier will check that BPF_R6 ==
- * ctx.
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only
+ * appearing in the programs where ctx == skb
+ * (see may_access_skb() in the verifier). All programs
+ * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6,
+ * bpf_convert_filter() saves it in BPF_R6, internal BPF
+ * verifier will check that BPF_R6 == ctx.
*
* BPF_ABS and BPF_IND are wrappers of function calls,
* so they scratch BPF_R1-BPF_R5 registers, preserve
@@ -1154,12 +1377,22 @@ const struct bpf_func_proto bpf_tail_call_proto = {
.arg3_type = ARG_ANYTHING,
};
-/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
+ * It is encouraged to implement bpf_int_jit_compile() instead, so that
+ * eBPF and implicitly also cBPF can get JITed!
+ */
struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
{
return prog;
}
+/* Stub for JITs that support eBPF. All cBPF code gets transformed into
+ * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
+ */
+void __weak bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
bool __weak bpf_helper_changes_pkt_data(void *func)
{
return false;
@@ -1173,3 +1406,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
{
return -EFAULT;
}
+
+/* All definitions of tracepoints related to BPF. */
+#define CREATE_TRACE_POINTS
+#include <linux/bpf_trace.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a753bbe7df0a..361a69dfe543 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,11 +13,12 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
+#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
struct bucket {
- struct hlist_head head;
+ struct hlist_nulls_head head;
raw_spinlock_t lock;
};
@@ -29,28 +30,26 @@ struct bpf_htab {
struct pcpu_freelist freelist;
struct bpf_lru lru;
};
- void __percpu *extra_elems;
+ struct htab_elem *__percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
-enum extra_elem_state {
- HTAB_NOT_AN_EXTRA_ELEM = 0,
- HTAB_EXTRA_ELEM_FREE,
- HTAB_EXTRA_ELEM_USED
-};
-
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
- struct hlist_node hash_node;
- struct bpf_htab *htab;
- struct pcpu_freelist_node fnode;
+ struct hlist_nulls_node hash_node;
+ struct {
+ void *padding;
+ union {
+ struct bpf_htab *htab;
+ struct pcpu_freelist_node fnode;
+ };
+ };
};
union {
struct rcu_head rcu;
- enum extra_elem_state state;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -71,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -122,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
static int prealloc_init(struct bpf_htab *htab)
{
+ u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- htab->elems = bpf_map_area_alloc(htab->elem_size *
- htab->map.max_entries);
+ if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ num_entries += num_possible_cpus();
+
+ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
if (!htab->elems)
return -ENOMEM;
if (!htab_is_percpu(htab))
goto skip_percpu_elems;
- for (i = 0; i < htab->map.max_entries; i++) {
+ for (i = 0; i < num_entries; i++) {
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
@@ -160,10 +167,11 @@ skip_percpu_elems:
if (htab_is_lru(htab))
bpf_lru_populate(&htab->lru, htab->elems,
offsetof(struct htab_elem, lru_node),
- htab->elem_size, htab->map.max_entries);
+ htab->elem_size, num_entries);
else
- pcpu_freelist_populate(&htab->freelist, htab->elems,
- htab->elem_size, htab->map.max_entries);
+ pcpu_freelist_populate(&htab->freelist,
+ htab->elems + offsetof(struct htab_elem, fnode),
+ htab->elem_size, num_entries);
return 0;
@@ -184,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
static int alloc_extra_elems(struct bpf_htab *htab)
{
- void __percpu *pptr;
+ struct htab_elem *__percpu *pptr, *l_new;
+ struct pcpu_freelist_node *l;
int cpu;
- pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
for_each_possible_cpu(cpu) {
- ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
- HTAB_EXTRA_ELEM_FREE;
+ l = pcpu_freelist_pop(&htab->freelist);
+ /* pop will succeed, since prealloc_init()
+ * preallocated extra num_possible_cpus elements
+ */
+ l_new = container_of(l, struct htab_elem, fnode);
+ *per_cpu_ptr(pptr, cpu) = l_new;
}
htab->extra_elems = pptr;
return 0;
@@ -217,6 +231,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
int err, i;
u64 cost;
+ BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+ BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+
if (lru && !capable(CAP_SYS_ADMIN))
/* LRU implementation is much complicated than other
* maps. Hence, limit to CAP_SYS_ADMIN for now.
@@ -326,29 +345,29 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_HEAD(&htab->buckets[i].head);
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
raw_spin_lock_init(&htab->buckets[i].lock);
}
- if (!percpu && !lru) {
- /* lru itself can remove the least used element, so
- * there is no need for an extra elem during map_update.
- */
- err = alloc_extra_elems(htab);
- if (err)
- goto free_buckets;
- }
-
if (prealloc) {
err = prealloc_init(htab);
if (err)
- goto free_extra_elems;
+ goto free_buckets;
+
+ if (!percpu && !lru) {
+ /* lru itself can remove the least used element, so
+ * there is no need for an extra elem during map_update.
+ */
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_prealloc;
+ }
}
return &htab->map;
-free_extra_elems:
- free_percpu(htab->extra_elems);
+free_prealloc:
+ prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
free_htab:
@@ -366,20 +385,44 @@ static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
return &__select_bucket(htab, hash)->head;
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
void *key, u32 key_size)
{
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+ u32 hash, void *key,
+ u32 key_size, u32 n_buckets)
+{
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_rcu(l, head, hash_node)
+again:
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
+ if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+ goto again;
+
return NULL;
}
@@ -387,7 +430,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l;
u32 hash, key_size;
@@ -400,7 +443,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
head = select_bucket(htab, hash);
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
return l;
}
@@ -433,8 +476,9 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
struct bpf_htab *htab = (struct bpf_htab *)arg;
- struct htab_elem *l, *tgt_l;
- struct hlist_head *head;
+ struct htab_elem *l = NULL, *tgt_l;
+ struct hlist_nulls_head *head;
+ struct hlist_nulls_node *n;
unsigned long flags;
struct bucket *b;
@@ -444,9 +488,9 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
raw_spin_lock_irqsave(&b->lock, flags);
- hlist_for_each_entry_rcu(l, head, hash_node)
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
break;
}
@@ -459,7 +503,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
int i;
@@ -473,7 +517,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
head = select_bucket(htab, hash);
/* lookup the key */
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
if (!l) {
i = 0;
@@ -481,7 +525,7 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
}
/* key was found, get next key in the same bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
struct htab_elem, hash_node);
if (next_l) {
@@ -500,7 +544,7 @@ find_first_elem:
head = select_bucket(htab, i);
/* pick first element in the bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
struct htab_elem, hash_node);
if (next_l) {
/* if it's not empty, just return it */
@@ -538,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
- if (l->state == HTAB_EXTRA_ELEM_USED) {
- l->state = HTAB_EXTRA_ELEM_FREE;
- return;
- }
-
- if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+ if (htab_is_prealloc(htab)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -573,43 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
- bool old_elem_exists)
+ struct htab_elem *old_elem)
{
u32 size = htab->map.value_size;
- bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
- struct htab_elem *l_new;
+ bool prealloc = htab_is_prealloc(htab);
+ struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
- int err = 0;
if (prealloc) {
- l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
- if (!l_new)
- err = -E2BIG;
- } else {
- if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
- atomic_dec(&htab->count);
- err = -E2BIG;
+ if (old_elem) {
+ /* if we're updating the existing element,
+ * use per-cpu extra elems to avoid freelist_pop/push
+ */
+ pl_new = this_cpu_ptr(htab->extra_elems);
+ l_new = *pl_new;
+ *pl_new = old_elem;
} else {
- l_new = kmalloc(htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return ERR_PTR(-ENOMEM);
- }
- }
+ struct pcpu_freelist_node *l;
- if (err) {
- if (!old_elem_exists)
- return ERR_PTR(err);
-
- /* if we're updating the existing element and the hash table
- * is full, use per-cpu extra elems
- */
- l_new = this_cpu_ptr(htab->extra_elems);
- if (l_new->state != HTAB_EXTRA_ELEM_FREE)
- return ERR_PTR(-E2BIG);
- l_new->state = HTAB_EXTRA_ELEM_USED;
+ l = pcpu_freelist_pop(&htab->freelist);
+ if (!l)
+ return ERR_PTR(-E2BIG);
+ l_new = container_of(l, struct htab_elem, fnode);
+ }
} else {
- l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+ if (!old_elem) {
+ /* when map is full and update() is replacing
+ * old element, it's ok to allocate, since
+ * old element will be freed immediately.
+ * Otherwise return an error
+ */
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size);
@@ -661,7 +700,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -690,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
goto err;
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
- !!l_old);
+ l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -700,10 +739,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
- hlist_del_rcu(&l_old->hash_node);
- free_htab_elem(htab, l_old);
+ hlist_nulls_del_rcu(&l_old->hash_node);
+ if (!htab_is_prealloc(htab))
+ free_htab_elem(htab, l_old);
}
ret = 0;
err:
@@ -716,7 +756,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -757,10 +797,10 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
bpf_lru_node_set_ref(&l_new->lru_node);
- hlist_del_rcu(&l_old->hash_node);
+ hlist_nulls_del_rcu(&l_old->hash_node);
}
ret = 0;
@@ -781,7 +821,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -815,12 +855,12 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
- hash, true, onallcpus, false);
+ hash, true, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
}
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
ret = 0;
err:
@@ -834,7 +874,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
@@ -882,7 +922,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
} else {
pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
value, onallcpus);
- hlist_add_head_rcu(&l_new->hash_node, head);
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
}
ret = 0;
@@ -910,7 +950,7 @@ static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct bucket *b;
struct htab_elem *l;
unsigned long flags;
@@ -930,7 +970,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
free_htab_elem(htab, l);
ret = 0;
}
@@ -942,7 +982,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct bucket *b;
struct htab_elem *l;
unsigned long flags;
@@ -962,7 +1002,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
- hlist_del_rcu(&l->hash_node);
+ hlist_nulls_del_rcu(&l->hash_node);
ret = 0;
}
@@ -977,14 +1017,13 @@ static void delete_all_elements(struct bpf_htab *htab)
int i;
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
- struct hlist_node *n;
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_safe(l, n, head, hash_node) {
- hlist_del_rcu(&l->hash_node);
- if (l->state != HTAB_EXTRA_ELEM_USED)
- htab_elem_free(htab, l);
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ htab_elem_free(htab, l);
}
}
}
@@ -1004,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
- if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+ if (!htab_is_prealloc(htab))
delete_all_elements(htab);
else
prealloc_destroy(htab);
@@ -1023,7 +1062,7 @@ static const struct bpf_map_ops htab_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list htab_type __read_mostly = {
+static struct bpf_map_type_list htab_type __ro_after_init = {
.ops = &htab_ops,
.type = BPF_MAP_TYPE_HASH,
};
@@ -1037,7 +1076,7 @@ static const struct bpf_map_ops htab_lru_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_type __read_mostly = {
+static struct bpf_map_type_list htab_lru_type __ro_after_init = {
.ops = &htab_lru_ops,
.type = BPF_MAP_TYPE_LRU_HASH,
};
@@ -1124,7 +1163,7 @@ static const struct bpf_map_ops htab_percpu_ops = {
.map_delete_elem = htab_map_delete_elem,
};
-static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
.ops = &htab_percpu_ops,
.type = BPF_MAP_TYPE_PERCPU_HASH,
};
@@ -1138,7 +1177,7 @@ static const struct bpf_map_ops htab_lru_percpu_ops = {
.map_delete_elem = htab_lru_map_delete_elem,
};
-static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
+static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
.ops = &htab_lru_percpu_ops,
.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 045cbe673356..3d24e238221e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.func = bpf_get_current_comm,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_RAW_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0b030c9126d3..fddcae801724 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,7 @@
#include <linux/parser.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
enum bpf_type {
BPF_TYPE_UNSPEC = 0,
@@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
ret = bpf_obj_do_pin(pname, raw, type);
if (ret != 0)
bpf_any_put(raw, type);
+ if ((trace_bpf_obj_pin_prog_enabled() ||
+ trace_bpf_obj_pin_map_enabled()) && !ret) {
+ if (type == BPF_TYPE_PROG)
+ trace_bpf_obj_pin_prog(raw, ufd, pname);
+ if (type == BPF_TYPE_MAP)
+ trace_bpf_obj_pin_map(raw, ufd, pname);
+ }
out:
putname(pname);
return ret;
@@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname)
else
goto out;
- if (ret < 0)
+ if (ret < 0) {
bpf_any_put(raw, type);
+ } else if (trace_bpf_obj_get_prog_enabled() ||
+ trace_bpf_obj_get_map_enabled()) {
+ if (type == BPF_TYPE_PROG)
+ trace_bpf_obj_get_prog(raw, ret, pname);
+ if (type == BPF_TYPE_MAP)
+ trace_bpf_obj_get_map(raw, ret, pname);
+ }
out:
putname(pname);
return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 000000000000..b37bd9ab7f57
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,527 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016,2017 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+ struct rcu_head rcu;
+ struct lpm_trie_node __rcu *child[2];
+ u32 prefixlen;
+ u32 flags;
+ u8 data[0];
+};
+
+struct lpm_trie {
+ struct bpf_map map;
+ struct lpm_trie_node __rcu *root;
+ size_t n_entries;
+ size_t max_prefixlen;
+ size_t data_size;
+ raw_spinlock_t lock;
+};
+
+/* This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be places as root
+ * node, denoted as (R) in the example below. As there are no other node, both
+ * child pointers are %NULL.
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). In child index depends on the next bit
+ * that is outside of what (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * |
+ * +----------------+
+ * | (2) |
+ * | 192.168.0.0/24 |
+ * | value: 2 |
+ * | [0] [1] |
+ * +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (2) | | (3) |
+ * | 192.168.0.0/24 | | 192.168.128.0/24 |
+ * | value: 2 | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because (4) of the semantics laid out
+ * above (bit #17 is 0), it would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differentiate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ * +----------------+
+ * | (1) (R) |
+ * | 192.168.0.0/16 |
+ * | value: 1 |
+ * | [0] [1] |
+ * +----------------+
+ * | |
+ * +----------------+ +------------------+
+ * | (4) (I) | | (3) |
+ * | 192.168.0.0/23 | | 192.168.128.0/24 |
+ * | value: --- | | value: 3 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +------------------+
+ * | |
+ * +----------------+ +----------------+
+ * | (2) | | (5) |
+ * | 192.168.0.0/24 | | 192.168.1.0/24 |
+ * | value: 2 | | value: 5 |
+ * | [0] [1] | | [0] [1] |
+ * +----------------+ +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+ return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie: The trie to get internal sizes from
+ * @node: The node to operate on
+ * @key: The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+ const struct lpm_trie_node *node,
+ const struct bpf_lpm_trie_key *key)
+{
+ size_t prefixlen = 0;
+ size_t i;
+
+ for (i = 0; i < trie->data_size; i++) {
+ size_t b;
+
+ b = 8 - fls(node->data[i] ^ key->data[i]);
+ prefixlen += b;
+
+ if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+ return min(node->prefixlen, key->prefixlen);
+
+ if (b < 8)
+ break;
+ }
+
+ return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *found = NULL;
+ struct bpf_lpm_trie_key *key = _key;
+
+ /* Start walking the trie from the root node ... */
+
+ for (node = rcu_dereference(trie->root); node;) {
+ unsigned int next_bit;
+ size_t matchlen;
+
+ /* Determine the longest prefix of @node that matches @key.
+ * If it's the maximum possible prefix for this trie, we have
+ * an exact match and can return it directly.
+ */
+ matchlen = longest_prefix_match(trie, node, key);
+ if (matchlen == trie->max_prefixlen) {
+ found = node;
+ break;
+ }
+
+ /* If the number of bits that match is smaller than the prefix
+ * length of @node, bail out and return the node we have seen
+ * last in the traversal (ie, the parent).
+ */
+ if (matchlen < node->prefixlen)
+ break;
+
+ /* Consider this node as return candidate unless it is an
+ * artificially added intermediate one.
+ */
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ found = node;
+
+ /* If the node match is fully satisfied, let's see if we can
+ * become more specific. Determine the next bit in the key and
+ * traverse down.
+ */
+ next_bit = extract_bit(key->data, node->prefixlen);
+ node = rcu_dereference(node->child[next_bit]);
+ }
+
+ if (!found)
+ return NULL;
+
+ return found->data + trie->data_size;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+ const void *value)
+{
+ struct lpm_trie_node *node;
+ size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+
+ if (value)
+ size += trie->map.value_size;
+
+ node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return NULL;
+
+ node->flags = 0;
+
+ if (value)
+ memcpy(node->data + trie->data_size, value,
+ trie->map.value_size);
+
+ return node;
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+ struct lpm_trie_node __rcu **slot;
+ struct bpf_lpm_trie_key *key = _key;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Allocate and fill a new node */
+
+ if (trie->n_entries == trie->map.max_entries) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ new_node = lpm_trie_node_alloc(trie, value);
+ if (!new_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ trie->n_entries++;
+
+ new_node->prefixlen = key->prefixlen;
+ RCU_INIT_POINTER(new_node->child[0], NULL);
+ RCU_INIT_POINTER(new_node->child[1], NULL);
+ memcpy(new_node->data, key->data, trie->data_size);
+
+ /* Now find a slot to attach the new node. To do that, walk the tree
+ * from the root and match as many bits as possible for each node until
+ * we either find an empty slot or a slot that needs to be replaced by
+ * an intermediate node.
+ */
+ slot = &trie->root;
+
+ while ((node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen ||
+ node->prefixlen == trie->max_prefixlen)
+ break;
+
+ next_bit = extract_bit(key->data, node->prefixlen);
+ slot = &node->child[next_bit];
+ }
+
+ /* If the slot is empty (a free child pointer or an empty root),
+ * simply assign the @new_node to that slot and be done.
+ */
+ if (!node) {
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ /* If the slot we picked already exists, replace it with @new_node
+ * which already has the correct data array set.
+ */
+ if (node->prefixlen == matchlen) {
+ new_node->child[0] = node->child[0];
+ new_node->child[1] = node->child[1];
+
+ if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ trie->n_entries--;
+
+ rcu_assign_pointer(*slot, new_node);
+ kfree_rcu(node, rcu);
+
+ goto out;
+ }
+
+ /* If the new node matches the prefix completely, it must be inserted
+ * as an ancestor. Simply insert it between @node and *@slot.
+ */
+ if (matchlen == key->prefixlen) {
+ next_bit = extract_bit(node->data, matchlen);
+ rcu_assign_pointer(new_node->child[next_bit], node);
+ rcu_assign_pointer(*slot, new_node);
+ goto out;
+ }
+
+ im_node = lpm_trie_node_alloc(trie, NULL);
+ if (!im_node) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ im_node->prefixlen = matchlen;
+ im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+ memcpy(im_node->data, node->data, trie->data_size);
+
+ /* Now determine which child to install in which slot */
+ if (extract_bit(key->data, matchlen)) {
+ rcu_assign_pointer(im_node->child[0], node);
+ rcu_assign_pointer(im_node->child[1], new_node);
+ } else {
+ rcu_assign_pointer(im_node->child[0], new_node);
+ rcu_assign_pointer(im_node->child[1], node);
+ }
+
+ /* Finally, assign the intermediate node to the determined spot */
+ rcu_assign_pointer(*slot, im_node);
+
+out:
+ if (ret) {
+ if (new_node)
+ trie->n_entries--;
+
+ kfree(new_node);
+ kfree(im_node);
+ }
+
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
+}
+
+static int trie_delete_elem(struct bpf_map *map, void *key)
+{
+ /* TODO */
+ return -ENOSYS;
+}
+
+#define LPM_DATA_SIZE_MAX 256
+#define LPM_DATA_SIZE_MIN 1
+
+#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
+ sizeof(struct lpm_trie_node))
+#define LPM_VAL_SIZE_MIN 1
+
+#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
+#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+ struct lpm_trie *trie;
+ u64 cost = sizeof(*trie), cost_per_node;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 ||
+ attr->map_flags != BPF_F_NO_PREALLOC ||
+ attr->key_size < LPM_KEY_SIZE_MIN ||
+ attr->key_size > LPM_KEY_SIZE_MAX ||
+ attr->value_size < LPM_VAL_SIZE_MIN ||
+ attr->value_size > LPM_VAL_SIZE_MAX)
+ return ERR_PTR(-EINVAL);
+
+ trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+ if (!trie)
+ return ERR_PTR(-ENOMEM);
+
+ /* copy mandatory map attributes */
+ trie->map.map_type = attr->map_type;
+ trie->map.key_size = attr->key_size;
+ trie->map.value_size = attr->value_size;
+ trie->map.max_entries = attr->max_entries;
+ trie->data_size = attr->key_size -
+ offsetof(struct bpf_lpm_trie_key, data);
+ trie->max_prefixlen = trie->data_size * 8;
+
+ cost_per_node = sizeof(struct lpm_trie_node) +
+ attr->value_size + trie->data_size;
+ cost += (u64) attr->max_entries * cost_per_node;
+ if (cost >= U32_MAX - PAGE_SIZE) {
+ ret = -E2BIG;
+ goto out_err;
+ }
+
+ trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ ret = bpf_map_precharge_memlock(trie->map.pages);
+ if (ret)
+ goto out_err;
+
+ raw_spin_lock_init(&trie->lock);
+
+ return &trie->map;
+out_err:
+ kfree(trie);
+ return ERR_PTR(ret);
+}
+
+static void trie_free(struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct lpm_trie_node __rcu **slot;
+ struct lpm_trie_node *node;
+
+ raw_spin_lock(&trie->lock);
+
+ /* Always start at the root and walk down to a node that has no
+ * children. Then free that node, nullify its reference in the parent
+ * and start over.
+ */
+
+ for (;;) {
+ slot = &trie->root;
+
+ for (;;) {
+ node = rcu_dereference_protected(*slot,
+ lockdep_is_held(&trie->lock));
+ if (!node)
+ goto unlock;
+
+ if (rcu_access_pointer(node->child[0])) {
+ slot = &node->child[0];
+ continue;
+ }
+
+ if (rcu_access_pointer(node->child[1])) {
+ slot = &node->child[1];
+ continue;
+ }
+
+ kfree(node);
+ RCU_INIT_POINTER(*slot, NULL);
+ break;
+ }
+ }
+
+unlock:
+ raw_spin_unlock(&trie->lock);
+}
+
+static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static const struct bpf_map_ops trie_ops = {
+ .map_alloc = trie_alloc,
+ .map_free = trie_free,
+ .map_get_next_key = trie_get_next_key,
+ .map_lookup_elem = trie_lookup_elem,
+ .map_update_elem = trie_update_elem,
+ .map_delete_elem = trie_delete_elem,
+};
+
+static struct bpf_map_type_list trie_type __ro_after_init = {
+ .ops = &trie_ops,
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+ bpf_register_map_type(&trie_type);
+ return 0;
+}
+late_initcall(register_trie_map);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index be8519148c25..22aa45cd0324 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -273,7 +273,7 @@ static const struct bpf_map_ops stack_map_ops = {
.map_delete_elem = stack_map_delete_elem,
};
-static struct bpf_map_type_list stack_map_type __read_mostly = {
+static struct bpf_map_type_list stack_map_type __ro_after_init = {
.ops = &stack_map_ops,
.type = BPF_MAP_TYPE_STACK_TRACE,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bbb016adbaeb..821f9e807de5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -10,8 +10,10 @@
* General Public License for more details.
*/
#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
@@ -241,6 +243,7 @@ static int map_create(union bpf_attr *attr)
/* failed to allocate fd */
goto free_map;
+ trace_bpf_map_create(map, err);
return err;
free_map:
@@ -365,6 +368,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
+ trace_bpf_map_lookup_elem(map, ufd, key, value);
err = 0;
free_value:
@@ -447,6 +451,8 @@ static int map_update_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
+ if (!err)
+ trace_bpf_map_update_elem(map, ufd, key, value);
free_value:
kfree(value);
free_key:
@@ -492,6 +498,8 @@ static int map_delete_elem(union bpf_attr *attr)
__this_cpu_dec(bpf_prog_active);
preempt_enable();
+ if (!err)
+ trace_bpf_map_delete_elem(map, ufd, key);
free_key:
kfree(key);
err_put:
@@ -544,6 +552,7 @@ static int map_get_next_key(union bpf_attr *attr)
if (copy_to_user(unext_key, next_key, map->key_size) != 0)
goto free_next_key;
+ trace_bpf_map_next_key(map, ufd, key, next_key);
err = 0;
free_next_key:
@@ -608,6 +617,14 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
if (insn->imm == BPF_FUNC_xdp_adjust_head)
prog->xdp_adjust_head = 1;
if (insn->imm == BPF_FUNC_tail_call) {
+ /* If we tail call into other programs, we
+ * cannot make any assumptions since they
+ * can be replaced dynamically during runtime
+ * in the program array.
+ */
+ prog->cb_access = 1;
+ prog->xdp_adjust_head = 1;
+
/* mark bpf_tail_call as different opcode
* to avoid conditional branch in
* interpeter for every normal call
@@ -697,8 +714,11 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
void bpf_prog_put(struct bpf_prog *prog)
{
- if (atomic_dec_and_test(&prog->aux->refcnt))
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ trace_bpf_prog_put_rcu(prog);
+ bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+ }
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -807,7 +827,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
{
- return __bpf_prog_get(ufd, &type);
+ struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+
+ if (!IS_ERR(prog))
+ trace_bpf_prog_get_type(prog);
+ return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get_type);
@@ -889,6 +913,8 @@ static int bpf_prog_load(union bpf_attr *attr)
/* failed to allocate fd */
goto free_used_maps;
+ bpf_prog_kallsyms_add(prog);
+ trace_bpf_prog_load(prog, err);
return err;
free_used_maps:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdc43b899f28..a834068a400e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -33,7 +33,7 @@
* - out of bounds or malformed jumps
* The second pass is all possible path descent from the 1st insn.
* Since it's analyzing all pathes through the program, the length of the
- * analysis is limited to 32k insn, which may be hit even if total number of
+ * analysis is limited to 64k insn, which may be hit even if total number of
* insn is less then 4K, but there are too many branches that change stack/regs.
* Number of 'branches to be analyzed' is limited to 1k
*
@@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
}
+static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
+ u32 regno)
+{
+ mark_reg_unknown_value(regs, regno);
+ reset_reg_range_values(regs, regno);
+}
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
switch (type) {
case PTR_TO_MAP_VALUE:
case PTR_TO_MAP_VALUE_OR_NULL:
+ case PTR_TO_MAP_VALUE_ADJ:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
if (value_regno >= 0)
/* have read misc data from the stack */
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
return 0;
}
}
@@ -627,7 +636,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
- if (off < 0 || off + size > map->value_size) {
+ if (off < 0 || size <= 0 || off + size > map->value_size) {
verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
map->value_size, off, size);
return -EACCES;
@@ -635,6 +644,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
return 0;
}
+/* check read/write into an adjusted map element */
+static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+ int off, int size)
+{
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *reg = &state->regs[regno];
+ int err;
+
+ /* We adjusted the register to this map value, so we
+ * need to change off and size to min_value and max_value
+ * respectively to make sure our theoretical access will be
+ * safe.
+ */
+ if (log_level)
+ print_verifier_state(state);
+ env->varlen_map_value_access = true;
+ /* The minimum value is only important with signed
+ * comparisons where we can't assume the floor of a
+ * value is 0. If we are using signed variables for our
+ * index'es we need to make sure that whatever we use
+ * will have a set floor within our range.
+ */
+ if (reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_map_access(env, regno, reg->min_value + off, size);
+ if (err) {
+ verbose("R%d min value is outside of the array range\n",
+ regno);
+ return err;
+ }
+
+ /* If we haven't set a max value then we need to bail
+ * since we can't be sure we won't do bad things.
+ */
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ regno);
+ return -EACCES;
+ }
+ return check_map_access(env, regno, reg->max_value + off, size);
+}
+
#define MAX_PACKET_OFF 0xffff
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
@@ -647,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
/* dst_input() and dst_output() can't write for now */
if (t == BPF_WRITE)
return false;
+ /* fallthrough */
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
case BPF_PROG_TYPE_XDP:
@@ -710,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
}
-static int check_ptr_alignment(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg, int off, int size)
+static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+ int off, int size)
{
- if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n",
- off, size);
- return -EACCES;
- } else {
- return 0;
- }
- }
-
- if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
- /* misaligned access to packet is ok on x86,arm,arm64 */
- return 0;
-
if (reg->id && size != 1) {
- verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+ verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
return -EACCES;
}
/* skb->data is NET_IP_ALIGN-ed */
- if (reg->type == PTR_TO_PACKET &&
- (NET_IP_ALIGN + reg->off + off) % size != 0) {
+ if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
verbose("misaligned packet access off %d+%d+%d size %d\n",
NET_IP_ALIGN, reg->off, off, size);
return -EACCES;
}
+
+ return 0;
+}
+
+static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
+ int size)
+{
+ if (size != 1) {
+ verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
+ return -EACCES;
+ }
+
return 0;
}
+static int check_ptr_alignment(const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ switch (reg->type) {
+ case PTR_TO_PACKET:
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+ check_pkt_ptr_alignment(reg, off, size);
+ case PTR_TO_MAP_VALUE_ADJ:
+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
+ check_val_ptr_alignment(reg, size);
+ default:
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n",
+ off, size);
+ return -EACCES;
+ }
+
+ return 0;
+ }
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -763,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
if (size < 0)
return size;
- err = check_ptr_alignment(env, reg, off, size);
+ err = check_ptr_alignment(reg, off, size);
if (err)
return err;
@@ -775,47 +848,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
return -EACCES;
}
- /* If we adjusted the register to this map value at all then we
- * need to change off and size to min_value and max_value
- * respectively to make sure our theoretical access will be
- * safe.
- */
- if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
- if (log_level)
- print_verifier_state(state);
- env->varlen_map_value_access = true;
- /* The minimum value is only important with signed
- * comparisons where we can't assume the floor of a
- * value is 0. If we are using signed variables for our
- * index'es we need to make sure that whatever we use
- * will have a set floor within our range.
- */
- if (reg->min_value < 0) {
- verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
- regno);
- return -EACCES;
- }
- err = check_map_access(env, regno, reg->min_value + off,
- size);
- if (err) {
- verbose("R%d min value is outside of the array range\n",
- regno);
- return err;
- }
-
- /* If we haven't set a max value then we need to bail
- * since we can't be sure we won't do bad things.
- */
- if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
- verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
- regno);
- return -EACCES;
- }
- off += reg->max_value;
- }
- err = check_map_access(env, regno, off, size);
+ if (reg->type == PTR_TO_MAP_VALUE_ADJ)
+ err = check_map_access_adj(env, regno, off, size);
+ else
+ err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = UNKNOWN_VALUE;
@@ -827,7 +866,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
}
err = check_ctx_access(env, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
/* note that reg.[id|off|range] == 0 */
state->regs[value_regno].type = reg_type;
}
@@ -860,7 +900,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown_value(state->regs, value_regno);
+ mark_reg_unknown_value_and_range(state->regs,
+ value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
@@ -958,6 +999,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+ int access_size, bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+
+ switch (regs[regno].type) {
+ case PTR_TO_PACKET:
+ return check_packet_access(env, regno, 0, access_size);
+ case PTR_TO_MAP_VALUE:
+ return check_map_access(env, regno, 0, access_size);
+ case PTR_TO_MAP_VALUE_ADJ:
+ return check_map_access_adj(env, regno, 0, access_size);
+ default: /* const_imm|ptr_to_stack or invalid ptr */
+ return check_stack_boundary(env, regno, access_size,
+ zero_size_allowed, meta);
+ }
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
@@ -993,10 +1053,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_STACK;
if (type != PTR_TO_PACKET && type != expected_type)
goto err_type;
- } else if (arg_type == ARG_CONST_STACK_SIZE ||
- arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+ } else if (arg_type == ARG_CONST_SIZE ||
+ arg_type == ARG_CONST_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
- if (type != expected_type)
+ /* One exception. Allow UNKNOWN_VALUE registers when the
+ * boundaries are known and don't cause unsafe memory accesses
+ */
+ if (type != UNKNOWN_VALUE && type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
@@ -1006,8 +1069,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
expected_type = PTR_TO_CTX;
if (type != expected_type)
goto err_type;
- } else if (arg_type == ARG_PTR_TO_STACK ||
- arg_type == ARG_PTR_TO_RAW_STACK) {
+ } else if (arg_type == ARG_PTR_TO_MEM ||
+ arg_type == ARG_PTR_TO_UNINIT_MEM) {
expected_type = PTR_TO_STACK;
/* One exception here. In case function allows for NULL to be
* passed in as argument, it's a CONST_IMM type. Final test
@@ -1015,9 +1078,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (type == CONST_IMM && reg->imm == 0)
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != expected_type)
+ else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+ type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
goto err_type;
- meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
+ meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -1063,9 +1127,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_stack_boundary(env, regno,
meta->map_ptr->value_size,
false, NULL);
- } else if (arg_type == ARG_CONST_STACK_SIZE ||
- arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
- bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
+ } else if (arg_type == ARG_CONST_SIZE ||
+ arg_type == ARG_CONST_SIZE_OR_ZERO) {
+ bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
/* bpf_xxx(..., buf, len) call will access 'len' bytes
* from stack pointer 'buf'. Check it
@@ -1073,14 +1137,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (regno == 0) {
/* kernel subsystem misconfigured verifier */
- verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ verbose("ARG_CONST_SIZE cannot be first argument\n");
return -EACCES;
}
- if (regs[regno - 1].type == PTR_TO_PACKET)
- err = check_packet_access(env, regno - 1, 0, reg->imm);
- else
- err = check_stack_boundary(env, regno - 1, reg->imm,
- zero_size_allowed, meta);
+
+ /* If the register is UNKNOWN_VALUE, the access check happens
+ * using its boundaries. Otherwise, just use its imm
+ */
+ if (type == UNKNOWN_VALUE) {
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->min_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
+ if (err)
+ return err;
+ }
+
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->max_value,
+ zero_size_allowed, meta);
+ if (err)
+ return err;
+ } else {
+ /* register is CONST_IMM */
+ err = check_helper_mem_access(env, regno - 1, reg->imm,
+ zero_size_allowed, meta);
+ }
}
return err;
@@ -1154,15 +1254,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
{
int count = 0;
- if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
count++;
- if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+ if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
count++;
return count > 1 ? -EINVAL : 0;
@@ -1316,7 +1416,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env,
imm = insn->imm;
add_imm:
- if (imm <= 0) {
+ if (imm < 0) {
verbose("addition of negative constant to packet pointer is not allowed\n");
return -EACCES;
}
@@ -1485,22 +1585,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
+ u64 dst_imm = dst_reg->imm;
- /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or'
- * insn. Don't care about overflow or negative values, just add them
+ /* dst_reg->type == CONST_IMM here. Simulate execution of insns
+ * containing ALU ops. Don't care about overflow or negative
+ * values, just add/sub/... them; registers are in u64.
*/
- if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
- dst_reg->imm += insn->imm;
- else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM)
- dst_reg->imm += src_reg->imm;
- else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K)
- dst_reg->imm |= insn->imm;
- else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
- src_reg->type == CONST_IMM)
- dst_reg->imm |= src_reg->imm;
- else
+ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm += insn->imm;
+ } else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm += src_reg->imm;
+ } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm -= insn->imm;
+ } else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm -= src_reg->imm;
+ } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm *= insn->imm;
+ } else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm *= src_reg->imm;
+ } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm |= insn->imm;
+ } else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm |= src_reg->imm;
+ } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm &= insn->imm;
+ } else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm &= src_reg->imm;
+ } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm >>= insn->imm;
+ } else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm >>= src_reg->imm;
+ } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
+ dst_imm <<= insn->imm;
+ } else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM) {
+ dst_imm <<= src_reg->imm;
+ } else {
mark_reg_unknown_value(regs, insn->dst_reg);
+ goto out;
+ }
+
+ dst_reg->imm = dst_imm;
+out:
return 0;
}
@@ -1811,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* register as unknown.
*/
if (env->allow_ptr_leaks &&
+ BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
(dst_reg->type == PTR_TO_MAP_VALUE ||
dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1859,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
- regs[i].range = dst_reg->off;
+ /* keep the maximum range already checked */
+ regs[i].range = max(regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = dst_reg->off;
+ reg->range = max(reg->range, dst_reg->off);
}
}
@@ -1894,6 +2028,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JGT:
/* Unsigned comparison, the minimum value is 0. */
false_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGT:
/* If this is false then we know the maximum val is val,
* otherwise we know the min val is val+1.
@@ -1904,6 +2039,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
case BPF_JGE:
/* Unsigned comparison, the minimum value is 0. */
false_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGE:
/* If this is false then we know the maximum value is val - 1,
* otherwise we know the mimimum value is val.
@@ -1942,6 +2078,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JGT:
/* Unsigned comparison, the minimum value is 0. */
true_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGT:
/*
* If this is false, then the val is <= the register, if it is
@@ -1953,6 +2090,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
case BPF_JGE:
/* Unsigned comparison, the minimum value is 0. */
true_reg->min_value = 0;
+ /* fallthrough */
case BPF_JSGE:
/* If this is false then constant < register, if it is true then
* the register < constant.
@@ -2144,14 +2282,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
if (insn->src_reg == 0) {
- /* generic move 64-bit immediate into a register,
- * only analyzer needs to collect the ld_imm value.
- */
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- if (!env->analyzer_ops)
- return 0;
-
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = imm;
return 0;
@@ -2664,7 +2796,7 @@ static int do_check(struct bpf_verifier_env *env)
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
- verbose("BPF program is too large. Proccessed %d insn\n",
+ verbose("BPF program is too large. Processed %d insn\n",
insn_processed);
return -E2BIG;
}
@@ -2729,7 +2861,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- reset_reg_range_values(regs, insn->dst_reg);
if (BPF_SIZE(insn->code) != BPF_W &&
BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++;
@@ -3085,10 +3216,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn = env->prog->insnsi + delta;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
insn->code == (BPF_STX | BPF_MEM | BPF_DW))
type = BPF_WRITE;
else
@@ -3097,8 +3232,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
continue;
- cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf, env->prog);
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
new file mode 100644
index 000000000000..387348a40c64
--- /dev/null
+++ b/kernel/cgroup/Makefile
@@ -0,0 +1,6 @@
+obj-y := cgroup.o namespace.o cgroup-v1.o
+
+obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += pids.o
+obj-$(CONFIG_CGROUP_RDMA) += rdma.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
new file mode 100644
index 000000000000..9203bfb05603
--- /dev/null
+++ b/kernel/cgroup/cgroup-internal.h
@@ -0,0 +1,214 @@
+#ifndef __CGROUP_INTERNAL_H
+#define __CGROUP_INTERNAL_H
+
+#include <linux/cgroup.h>
+#include <linux/kernfs.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies. In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+ /* the cgroup and css_set this link associates */
+ struct cgroup *cgrp;
+ struct css_set *cset;
+
+ /* list of cgrp_cset_links anchored at cgrp->cset_links */
+ struct list_head cset_link;
+
+ /* list of cgrp_cset_links anchored at css_set->cgrp_links */
+ struct list_head cgrp_link;
+};
+
+/* used to track tasks and csets during migration */
+struct cgroup_taskset {
+ /* the src and dst cset list running through cset->mg_node */
+ struct list_head src_csets;
+ struct list_head dst_csets;
+
+ /* the subsys currently being processed */
+ int ssid;
+
+ /*
+ * Fields for cgroup_taskset_*() iteration.
+ *
+ * Before migration is committed, the target migration tasks are on
+ * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
+ * the csets on ->dst_csets. ->csets point to either ->src_csets
+ * or ->dst_csets depending on whether migration is committed.
+ *
+ * ->cur_csets and ->cur_task point to the current task position
+ * during iteration.
+ */
+ struct list_head *csets;
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+};
+
+/* migration context also tracks preloading */
+struct cgroup_mgctx {
+ /*
+ * Preloaded source and destination csets. Used to guarantee
+ * atomic success or failure on actual migration.
+ */
+ struct list_head preloaded_src_csets;
+ struct list_head preloaded_dst_csets;
+
+ /* tasks and csets to migrate */
+ struct cgroup_taskset tset;
+
+ /* subsystems affected by migration */
+ u16 ss_mask;
+};
+
+#define CGROUP_TASKSET_INIT(tset) \
+{ \
+ .src_csets = LIST_HEAD_INIT(tset.src_csets), \
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
+ .csets = &tset.src_csets, \
+}
+
+#define CGROUP_MGCTX_INIT(name) \
+{ \
+ LIST_HEAD_INIT(name.preloaded_src_csets), \
+ LIST_HEAD_INIT(name.preloaded_dst_csets), \
+ CGROUP_TASKSET_INIT(name.tset), \
+}
+
+#define DEFINE_CGROUP_MGCTX(name) \
+ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
+
+struct cgroup_sb_opts {
+ u16 subsys_mask;
+ unsigned int flags;
+ char *release_agent;
+ bool cpuset_clone_children;
+ char *name;
+ /* User explicitly requested empty subsystem */
+ bool none;
+};
+
+extern struct mutex cgroup_mutex;
+extern spinlock_t css_set_lock;
+extern struct cgroup_subsys *cgroup_subsys[];
+extern struct list_head cgroup_roots;
+extern struct file_system_type cgroup_fs_type;
+
+/* iterate across the hierarchies */
+#define for_each_root(root) \
+ list_for_each_entry((root), &cgroup_roots, root_list)
+
+/**
+ * for_each_subsys - iterate all enabled cgroup subsystems
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ */
+#define for_each_subsys(ss, ssid) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
+ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
+
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
+{
+ return !(cgrp->self.flags & CSS_ONLINE);
+}
+
+static inline bool notify_on_release(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+}
+
+void put_css_set_locked(struct css_set *cset);
+
+static inline void put_css_set(struct css_set *cset)
+{
+ unsigned long flags;
+
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cset->refcount, -1, 1))
+ return;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ put_css_set_locked(cset);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+}
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cset)
+{
+ atomic_inc(&cset->refcount);
+}
+
+bool cgroup_ssid_enabled(int ssid);
+bool cgroup_on_dfl(const struct cgroup *cgrp);
+
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root);
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
+void cgroup_kn_unlock(struct kernfs_node *kn);
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns);
+
+void cgroup_free_root(struct cgroup_root *root);
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+ struct cgroup_root *root, unsigned long magic,
+ struct cgroup_namespace *ns);
+
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
+void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx);
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx);
+
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup);
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup);
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off);
+
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
+int cgroup_rmdir(struct kernfs_node *kn);
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root);
+
+/*
+ * namespace.c
+ */
+extern const struct proc_ns_operations cgroupns_operations;
+
+/*
+ * cgroup-v1.c
+ */
+extern struct cftype cgroup1_base_files[];
+extern const struct file_operations proc_cgroupstats_operations;
+extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+
+bool cgroup1_ssid_disabled(int ssid);
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
+void cgroup1_release_agent(struct work_struct *work);
+void cgroup1_check_for_release(struct cgroup *cgrp);
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+ void *data, unsigned long magic,
+ struct cgroup_namespace *ns);
+
+#endif /* __CGROUP_INTERNAL_H */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
new file mode 100644
index 000000000000..1dc22f6b49f5
--- /dev/null
+++ b/kernel/cgroup/cgroup-v1.c
@@ -0,0 +1,1398 @@
+#include "cgroup-internal.h"
+
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/sort.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/delayacct.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroupstats.h>
+
+#include <trace/events/cgroup.h>
+
+/*
+ * pidlists linger the following amount before being destroyed. The goal
+ * is avoiding frequent destruction in the middle of consecutive read calls
+ * Expiring in the middle is a performance problem not a correctness one.
+ * 1 sec should be enough.
+ */
+#define CGROUP_PIDLIST_DESTROY_DELAY HZ
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
+
+/*
+ * pidlist destructions need to be flushed on cgroup destruction. Use a
+ * separate workqueue as flush domain.
+ */
+static struct workqueue_struct *cgroup_pidlist_destroy_wq;
+
+/*
+ * Protects cgroup_subsys->release_agent_path. Modifying it also requires
+ * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
+ */
+static DEFINE_SPINLOCK(release_agent_path_lock);
+
+bool cgroup1_ssid_disabled(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+ struct cgroup_root *root;
+ int retval = 0;
+
+ mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+ for_each_root(root) {
+ struct cgroup *from_cgrp;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ spin_lock_irq(&css_set_lock);
+ from_cgrp = task_cgroup_from_root(from, root);
+ spin_unlock_irq(&css_set_lock);
+
+ retval = cgroup_attach_task(from_cgrp, tsk, false);
+ if (retval)
+ break;
+ }
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ mutex_unlock(&cgroup_mutex);
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/**
+ * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * @to: cgroup to which the tasks will be moved
+ * @from: cgroup in which the tasks currently reside
+ *
+ * Locking rules between cgroup_post_fork() and the migration path
+ * guarantee that, if a task is forking while being migrated, the new child
+ * is guaranteed to be either visible in the source cgroup after the
+ * parent's migration is complete or put into the target cgroup. No task
+ * can slip out of migration through forking.
+ */
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
+{
+ DEFINE_CGROUP_MGCTX(mgctx);
+ struct cgrp_cset_link *link;
+ struct css_task_iter it;
+ struct task_struct *task;
+ int ret;
+
+ if (cgroup_on_dfl(to))
+ return -EINVAL;
+
+ if (!cgroup_may_migrate_to(to))
+ return -EBUSY;
+
+ mutex_lock(&cgroup_mutex);
+
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
+ /* all tasks in @from are being moved, all csets are source */
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &from->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, to, &mgctx);
+ spin_unlock_irq(&css_set_lock);
+
+ ret = cgroup_migrate_prepare_dst(&mgctx);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Migrate tasks one-by-one until @from is empty. This fails iff
+ * ->can_attach() fails.
+ */
+ do {
+ css_task_iter_start(&from->self, &it);
+ task = css_task_iter_next(&it);
+ if (task)
+ get_task_struct(task);
+ css_task_iter_end(&it);
+
+ if (task) {
+ ret = cgroup_migrate(task, false, &mgctx);
+ if (!ret)
+ trace_cgroup_transfer_tasks(to, task, false);
+ put_task_struct(task);
+ }
+ } while (task && !ret);
+out_err:
+ cgroup_migrate_finish(&mgctx);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+ CGROUP_FILE_PROCS,
+ CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+ /*
+ * used to find which pidlist is wanted. doesn't change as long as
+ * this particular list stays in the list.
+ */
+ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+ /* array of xids */
+ pid_t *list;
+ /* how many elements the above list has */
+ int length;
+ /* each of these stored in a list by its cgroup */
+ struct list_head links;
+ /* pointer to the cgroup we belong to, for list removal purposes */
+ struct cgroup *owner;
+ /* for delayed destruction */
+ struct delayed_work destroy_dwork;
+};
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)
+{
+ if (PIDLIST_TOO_LARGE(count))
+ return vmalloc(count * sizeof(pid_t));
+ else
+ return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+
+static void pidlist_free(void *p)
+{
+ kvfree(p);
+}
+
+/*
+ * Used to destroy all pidlists lingering waiting for destroy timer. None
+ * should be left afterwards.
+ */
+void cgroup1_pidlist_destroy_all(struct cgroup *cgrp)
+{
+ struct cgroup_pidlist *l, *tmp_l;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+ list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
+ mutex_unlock(&cgrp->pidlist_mutex);
+
+ flush_workqueue(cgroup_pidlist_destroy_wq);
+ BUG_ON(!list_empty(&cgrp->pidlists));
+}
+
+static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
+ destroy_dwork);
+ struct cgroup_pidlist *tofree = NULL;
+
+ mutex_lock(&l->owner->pidlist_mutex);
+
+ /*
+ * Destroy iff we didn't get queued again. The state won't change
+ * as destroy_dwork can only be queued while locked.
+ */
+ if (!delayed_work_pending(dwork)) {
+ list_del(&l->links);
+ pidlist_free(l->list);
+ put_pid_ns(l->key.ns);
+ tofree = l;
+ }
+
+ mutex_unlock(&l->owner->pidlist_mutex);
+ kfree(tofree);
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * Returns the number of unique elements.
+ */
+static int pidlist_uniq(pid_t *list, int length)
+{
+ int src, dest = 1;
+
+ /*
+ * we presume the 0th element is unique, so i starts at 1. trivial
+ * edge cases first; no work needs to be done for either
+ */
+ if (length == 0 || length == 1)
+ return length;
+ /* src and dest walk down the list; dest counts unique elements */
+ for (src = 1; src < length; src++) {
+ /* find next unique element */
+ while (list[src] == list[src-1]) {
+ src++;
+ if (src == length)
+ goto after;
+ }
+ /* dest always points to where the next unique element goes */
+ list[dest] = list[src];
+ dest++;
+ }
+after:
+ return dest;
+}
+
+/*
+ * The two pid files - task and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs differently sorted list,
+ * making it impossible to use, for example, single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement shared pool of
+ * pidlists keyed by cgroup and namespace.
+ */
+static int cmppid(const void *a, const void *b)
+{
+ return *(pid_t *)a - *(pid_t *)b;
+}
+
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ struct pid_namespace *ns = task_active_pid_ns(current);
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ list_for_each_entry(l, &cgrp->pidlists, links)
+ if (l->key.type == type && l->key.ns == ns)
+ return l;
+ return NULL;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
+ enum cgroup_filetype type)
+{
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ l = cgroup_pidlist_find(cgrp, type);
+ if (l)
+ return l;
+
+ /* entry not found; create a new one */
+ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+ if (!l)
+ return l;
+
+ INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
+ l->key.type = type;
+ /* don't need task_nsproxy() if we're looking at ourself */
+ l->key.ns = get_pid_ns(task_active_pid_ns(current));
+ l->owner = cgrp;
+ list_add(&l->links, &cgrp->pidlists);
+ return l;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup. The returned number can be
+ * higher than the actual number of tasks due to css_set references from
+ * namespace roots and temporary usages.
+ */
+static int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += atomic_read(&link->cset->refcount);
+ spin_unlock_irq(&css_set_lock);
+ return count;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+ struct cgroup_pidlist **lp)
+{
+ pid_t *array;
+ int length;
+ int pid, n = 0; /* used for populating the array */
+ struct css_task_iter it;
+ struct task_struct *tsk;
+ struct cgroup_pidlist *l;
+
+ lockdep_assert_held(&cgrp->pidlist_mutex);
+
+ /*
+ * If cgroup gets more users after we read count, we won't have
+ * enough space - tough. This race is indistinguishable to the
+ * caller from the case that the additional cgroup users didn't
+ * show up until sometime later on.
+ */
+ length = cgroup_task_count(cgrp);
+ array = pidlist_allocate(length);
+ if (!array)
+ return -ENOMEM;
+ /* now, populate the array */
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ if (unlikely(n == length))
+ break;
+ /* get tgid or pid for procs or tasks file respectively */
+ if (type == CGROUP_FILE_PROCS)
+ pid = task_tgid_vnr(tsk);
+ else
+ pid = task_pid_vnr(tsk);
+ if (pid > 0) /* make sure to only use valid results */
+ array[n++] = pid;
+ }
+ css_task_iter_end(&it);
+ length = n;
+ /* now sort & (if procs) strip out duplicates */
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (type == CGROUP_FILE_PROCS)
+ length = pidlist_uniq(array, length);
+
+ l = cgroup_pidlist_find_create(cgrp, type);
+ if (!l) {
+ pidlist_free(array);
+ return -ENOMEM;
+ }
+
+ /* store array, freeing old if necessary */
+ pidlist_free(l->list);
+ l->list = array;
+ l->length = length;
+ *lp = l;
+ return 0;
+}
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+ /*
+ * Initially we receive a position value that corresponds to
+ * one more than the last pid shown (or 0 on the first call or
+ * after a seek to the start). Use a binary-search to find the
+ * next pid to display, if any
+ */
+ struct kernfs_open_file *of = s->private;
+ struct cgroup *cgrp = seq_css(s)->cgroup;
+ struct cgroup_pidlist *l;
+ enum cgroup_filetype type = seq_cft(s)->private;
+ int index = 0, pid = *pos;
+ int *iter, ret;
+
+ mutex_lock(&cgrp->pidlist_mutex);
+
+ /*
+ * !NULL @of->priv indicates that this isn't the first start()
+ * after open. If the matching pidlist is around, we can use that.
+ * Look for it. Note that @of->priv can't be used directly. It
+ * could already have been destroyed.
+ */
+ if (of->priv)
+ of->priv = cgroup_pidlist_find(cgrp, type);
+
+ /*
+ * Either this is the first start() after open or the matching
+ * pidlist has been destroyed inbetween. Create a new one.
+ */
+ if (!of->priv) {
+ ret = pidlist_array_load(cgrp, type,
+ (struct cgroup_pidlist **)&of->priv);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+ l = of->priv;
+
+ if (pid) {
+ int end = l->length;
+
+ while (index < end) {
+ int mid = (index + end) / 2;
+ if (l->list[mid] == pid) {
+ index = mid;
+ break;
+ } else if (l->list[mid] <= pid)
+ index = mid + 1;
+ else
+ end = mid;
+ }
+ }
+ /* If we're off the end of the array, we're done */
+ if (index >= l->length)
+ return NULL;
+ /* Update the abstract position to be the actual pid that we found */
+ iter = l->list + index;
+ *pos = *iter;
+ return iter;
+}
+
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+
+ if (l)
+ mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
+ CGROUP_PIDLIST_DESTROY_DELAY);
+ mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kernfs_open_file *of = s->private;
+ struct cgroup_pidlist *l = of->priv;
+ pid_t *p = v;
+ pid_t *end = l->list + l->length;
+ /*
+ * Advance to the next pid in the array. If this goes off the
+ * end, we're done
+ */
+ p++;
+ if (p >= end) {
+ return NULL;
+ } else {
+ *pos = *p;
+ return p;
+ }
+}
+
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+ seq_printf(s, "%d\n", *(int *)v);
+
+ return 0;
+}
+
+static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, nbytes, off, false);
+}
+
+static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+ spin_lock(&release_agent_path_lock);
+ strlcpy(cgrp->root->release_agent_path, strstrip(buf),
+ sizeof(cgrp->root->release_agent_path));
+ spin_unlock(&release_agent_path_lock);
+ cgroup_kn_unlock(of->kn);
+ return nbytes;
+}
+
+static int cgroup_release_agent_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ spin_lock(&release_agent_path_lock);
+ seq_puts(seq, cgrp->root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+ seq_putc(seq, '\n');
+ return 0;
+}
+
+static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
+{
+ seq_puts(seq, "0\n");
+ return 0;
+}
+
+static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return notify_on_release(css->cgroup);
+}
+
+static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
+ return 0;
+}
+
+static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+}
+
+static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
+ struct cftype *cft, u64 val)
+{
+ if (val)
+ set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ else
+ clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ return 0;
+}
+
+/* cgroup core interface files for the legacy hierarchies */
+struct cftype cgroup1_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
+ {
+ .name = "tasks",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_TASKS,
+ .write = cgroup_tasks_write,
+ },
+ {
+ .name = "notify_on_release",
+ .read_u64 = cgroup_read_notify_on_release,
+ .write_u64 = cgroup_write_notify_on_release,
+ },
+ {
+ .name = "release_agent",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_release_agent_show,
+ .write = cgroup_release_agent_write,
+ .max_write_len = PATH_MAX - 1,
+ },
+ { } /* terminate */
+};
+
+/* Display information about each subsystem and each hierarchy */
+static int proc_cgroupstats_show(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ /*
+ * ideally we don't want subsystems moving around while we do this.
+ * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+ * subsys/hierarchy state.
+ */
+ mutex_lock(&cgroup_mutex);
+
+ for_each_subsys(ss, i)
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->legacy_name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+static int cgroupstats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_cgroupstats_show, NULL);
+}
+
+const struct file_operations proc_cgroupstats_operations = {
+ .open = cgroupstats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+ struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct cgroup *cgrp;
+ struct css_task_iter it;
+ struct task_struct *tsk;
+
+ /* it should be kernfs_node belonging to cgroupfs and is a directory */
+ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
+ kernfs_type(kn) != KERNFS_DIR)
+ return -EINVAL;
+
+ mutex_lock(&cgroup_mutex);
+
+ /*
+ * We aren't being called from kernfs and there's no guarantee on
+ * @kn->priv's validity. For this and css_tryget_online_from_dir(),
+ * @kn->priv is RCU safe. Let's do the RCU dancing.
+ */
+ rcu_read_lock();
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || cgroup_is_dead(cgrp)) {
+ rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
+ return -ENOENT;
+ }
+ rcu_read_unlock();
+
+ css_task_iter_start(&cgrp->self, &it);
+ while ((tsk = css_task_iter_next(&it))) {
+ switch (tsk->state) {
+ case TASK_RUNNING:
+ stats->nr_running++;
+ break;
+ case TASK_INTERRUPTIBLE:
+ stats->nr_sleeping++;
+ break;
+ case TASK_UNINTERRUPTIBLE:
+ stats->nr_uninterruptible++;
+ break;
+ case TASK_STOPPED:
+ stats->nr_stopped++;
+ break;
+ default:
+ if (delayacct_is_task_waiting_on_io(tsk))
+ stats->nr_io_wait++;
+ break;
+ }
+ }
+ css_task_iter_end(&it);
+
+ mutex_unlock(&cgroup_mutex);
+ return 0;
+}
+
+void cgroup1_check_for_release(struct cgroup *cgrp)
+{
+ if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence. Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d. The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task. We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ */
+void cgroup1_release_agent(struct work_struct *work)
+{
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL;
+ char *argv[3], *envp[3];
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ spin_lock_irq(&css_set_lock);
+ ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ if (ret < 0 || ret >= PATH_MAX)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = pathbuf;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name_str)
+{
+ struct cgroup *cgrp = kn->priv;
+ int ret;
+
+ if (kernfs_type(kn) != KERNFS_DIR)
+ return -ENOTDIR;
+ if (kn->parent != new_parent)
+ return -EIO;
+
+ /*
+ * We're gonna grab cgroup_mutex which nests outside kernfs
+ * active_ref. kernfs_rename() doesn't require active_ref
+ * protection. Break them before grabbing cgroup_mutex.
+ */
+ kernfs_break_active_protection(new_parent);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&cgroup_mutex);
+
+ ret = kernfs_rename(kn, new_parent, new_name_str);
+ if (!ret)
+ trace_cgroup_rename(cgrp);
+
+ mutex_unlock(&cgroup_mutex);
+
+ kernfs_unbreak_active_protection(kn);
+ kernfs_unbreak_active_protection(new_parent);
+ return ret;
+}
+
+static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->legacy_name, NULL);
+ if (root->flags & CGRP_ROOT_NOPREFIX)
+ seq_puts(seq, ",noprefix");
+ if (root->flags & CGRP_ROOT_XATTR)
+ seq_puts(seq, ",xattr");
+
+ spin_lock(&release_agent_path_lock);
+ if (strlen(root->release_agent_path))
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
+ spin_unlock(&release_agent_path_lock);
+
+ if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
+ seq_puts(seq, ",clone_children");
+ if (strlen(root->name))
+ seq_show_option(seq, "name", root->name);
+ return 0;
+}
+
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
+{
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
+ u16 mask = U16_MAX;
+ struct cgroup_subsys *ss;
+ int nr_opts = 0;
+ int i;
+
+#ifdef CONFIG_CPUSETS
+ mask = ~((u16)1 << cpuset_cgrp_id);
+#endif
+
+ memset(opts, 0, sizeof(*opts));
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
+ if (!*token)
+ return -EINVAL;
+ if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
+ opts->flags |= CGRP_ROOT_NOPREFIX;
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->cpuset_clone_children = true;
+ continue;
+ }
+ if (!strcmp(token, "xattr")) {
+ opts->flags |= CGRP_ROOT_XATTR;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
+ /* Specifying two release agents is forbidden */
+ if (opts->release_agent)
+ return -EINVAL;
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
+ if (!opts->release_agent)
+ return -ENOMEM;
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
+ const char *name = token + 5;
+ /* Can't specify an empty name */
+ if (!strlen(name))
+ return -EINVAL;
+ /* Must match [\w.-]+ */
+ for (i = 0; i < strlen(name); i++) {
+ char c = name[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return -EINVAL;
+ }
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(name,
+ MAX_CGROUP_ROOT_NAMELEN - 1,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
+
+ continue;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->legacy_name))
+ continue;
+ if (!cgroup_ssid_enabled(i))
+ continue;
+ if (cgroup1_ssid_disabled(i))
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ opts->subsys_mask |= (1 << i);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i))
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
+ * Option noprefix was introduced just for backward compatibility
+ * with the old cpuset, so we allow noprefix only if mounting just
+ * the cpuset subsystem.
+ */
+ if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
+ return -EINVAL;
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_mask && opts->none)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data)
+{
+ int ret = 0;
+ struct cgroup_root *root = cgroup_root_from_kf(kf_root);
+ struct cgroup_sb_opts opts;
+ u16 added_mask, removed_mask;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* See what subsystems are wanted */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
+ pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
+ task_tgid_nr(current), current->comm);
+
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
+
+ /* Don't allow flags or name to change at remount */
+ if ((opts.flags ^ root->flags) ||
+ (opts.name && strcmp(opts.name, root->name))) {
+ pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
+ opts.flags, opts.name ?: "", root->flags, root->name);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* remounting is not allowed for populated hierarchies */
+ if (!list_empty(&root->cgrp.self.children)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ ret = rebind_subsystems(root, added_mask);
+ if (ret)
+ goto out_unlock;
+
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
+
+ if (opts.release_agent) {
+ spin_lock(&release_agent_path_lock);
+ strcpy(root->release_agent_path, opts.release_agent);
+ spin_unlock(&release_agent_path_lock);
+ }
+
+ trace_cgroup_remount(root);
+
+ out_unlock:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
+struct kernfs_syscall_ops cgroup1_kf_syscall_ops = {
+ .rename = cgroup1_rename,
+ .show_options = cgroup1_show_options,
+ .remount_fs = cgroup1_remount,
+ .mkdir = cgroup_mkdir,
+ .rmdir = cgroup_rmdir,
+ .show_path = cgroup_show_path,
+};
+
+struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
+ void *data, unsigned long magic,
+ struct cgroup_namespace *ns)
+{
+ struct super_block *pinned_sb = NULL;
+ struct cgroup_sb_opts opts;
+ struct cgroup_root *root;
+ struct cgroup_subsys *ss;
+ struct dentry *dentry;
+ int i, ret;
+
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
+
+ /* First find the desired set of subsystems */
+ ret = parse_cgroupfs_options(data, &opts);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Destruction of cgroup root is asynchronous, so subsystems may
+ * still be dying after the previous unmount. Let's drain the
+ * dying subsystems. We just need to ensure that the ones
+ * unmounted previously finish dying and don't care about new ones
+ * starting. Testing ref liveliness is good enough.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)) ||
+ ss->root == &cgrp_dfl_root)
+ continue;
+
+ if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ cgroup_put(&ss->root->cgrp);
+ }
+
+ for_each_root(root) {
+ bool name_match = false;
+
+ if (root == &cgrp_dfl_root)
+ continue;
+
+ /*
+ * If we asked for a name then it must match. Also, if
+ * name matches but sybsys_mask doesn't, we should fail.
+ * Remember whether name matched.
+ */
+ if (opts.name) {
+ if (strcmp(opts.name, root->name))
+ continue;
+ name_match = true;
+ }
+
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match.
+ */
+ if ((opts.subsys_mask || opts.none) &&
+ (opts.subsys_mask != root->subsys_mask)) {
+ if (!name_match)
+ continue;
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
+
+ /*
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, tryget_live can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
+ */
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * No such thing, create a new one. name= matching without subsys
+ * specification is allowed for already existing hierarchies but we
+ * can't create new one without subsys specification.
+ */
+ if (!opts.subsys_mask && !opts.none) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Hierarchies may only be created in the initial cgroup namespace. */
+ if (ns != &init_cgroup_ns) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ init_cgroup_root(root, &opts);
+
+ ret = cgroup_setup_root(root, opts.subsys_mask);
+ if (ret)
+ cgroup_free_root(root);
+
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+
+ if (ret)
+ return ERR_PTR(ret);
+
+ dentry = cgroup_do_mount(&cgroup_fs_type, flags, root,
+ CGROUP_SUPER_MAGIC, ns);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb)
+ deactivate_super(pinned_sb);
+
+ return dentry;
+}
+
+static int __init cgroup1_wq_init(void)
+{
+ /*
+ * Used to destroy pidlists and separate to serve as flush domain.
+ * Cap @max_active to 1 too.
+ */
+ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
+ 0, 1);
+ BUG_ON(!cgroup_pidlist_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup1_wq_init);
+
+static int __init cgroup_no_v1(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ if (!strcmp(token, "all")) {
+ cgroup_no_v1_mask = U16_MAX;
+ break;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ cgroup_no_v1_mask |= 1 << i;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
+
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+
+ seq_printf(seq, "css_set %pK\n", cset);
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+ goto overflow;
+ seq_printf(seq, " task %d\n", task_pid_vnr(task));
+ }
+ continue;
+ overflow:
+ seq_puts(seq, " ...\n");
+ }
+ spin_unlock_irq(&css_set_lock);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_files,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup.c b/kernel/cgroup/cgroup.c
index 53bbca7c4859..687f5e0194ef 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -28,35 +28,27 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cgroup.h>
+#include "cgroup-internal.h"
+
#include <linux/cred.h>
-#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
-#include <linux/list.h>
#include <linux/magic.h>
-#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
-#include <linux/sort.h>
-#include <linux/kmod.h>
-#include <linux/delayacct.h>
-#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
-#include <linux/pid_namespace.h>
#include <linux/idr.h>
-#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
-#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
@@ -67,14 +59,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
-/*
- * pidlists linger the following amount before being destroyed. The goal
- * is avoiding frequent destruction in the middle of consecutive read calls
- * Expiring in the middle is a performance problem not a correctness one.
- * 1 sec should be enough.
- */
-#define CGROUP_PIDLIST_DESTROY_DELAY HZ
-
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
@@ -88,14 +72,12 @@
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
*/
-#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DEFINE_SPINLOCK(css_set_lock);
+
+#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
-#else
-static DEFINE_MUTEX(cgroup_mutex);
-static DEFINE_SPINLOCK(css_set_lock);
#endif
/*
@@ -110,12 +92,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
@@ -131,15 +107,9 @@ struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
*/
static struct workqueue_struct *cgroup_destroy_wq;
-/*
- * pidlist destructions need to be flushed on cgroup destruction. Use a
- * separate workqueue as flush domain.
- */
-static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
-static struct cgroup_subsys *cgroup_subsys[] = {
+struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
@@ -186,18 +156,14 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
*/
static bool cgrp_dfl_visible;
-/* Controllers blocked by the commandline in v1 */
-static u16 cgroup_no_v1_mask;
-
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
-static unsigned long cgrp_dfl_implicit_ss_mask;
+static u16 cgrp_dfl_implicit_ss_mask;
/* The list of hierarchy roots */
-
-static LIST_HEAD(cgroup_roots);
+LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
@@ -213,13 +179,13 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
static u64 css_serial_nr_next = 1;
/*
- * These bitmask flags indicate whether tasks in the fork and exit paths have
- * fork/exit handlers to call. This avoids us having to do extra work in the
- * fork/exit path to check which subsystems have fork/exit callbacks.
+ * These bitmasks identify subsystems with specific features to avoid
+ * having to do iterative checks repeatedly.
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
@@ -230,15 +196,9 @@ struct cgroup_namespace init_cgroup_ns = {
.root_cset = &init_css_set,
};
-/* Ditto for the can_fork callback. */
-static u16 have_canfork_callback __read_mostly;
-
static struct file_system_type cgroup2_fs_type;
-static struct cftype cgroup_dfl_base_files[];
-static struct cftype cgroup_legacy_base_files[];
+static struct cftype cgroup_base_files[];
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
@@ -259,7 +219,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
-static bool cgroup_ssid_enabled(int ssid)
+bool cgroup_ssid_enabled(int ssid)
{
if (CGROUP_SUBSYS_COUNT == 0)
return false;
@@ -267,11 +227,6 @@ static bool cgroup_ssid_enabled(int ssid)
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
-static bool cgroup_ssid_no_v1(int ssid)
-{
- return cgroup_no_v1_mask & (1 << ssid);
-}
-
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
@@ -325,7 +280,7 @@ static bool cgroup_ssid_no_v1(int ssid)
*
* - debug: disallowed on the default hierarchy.
*/
-static bool cgroup_on_dfl(const struct cgroup *cgrp)
+bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
@@ -481,12 +436,6 @@ out_unlock:
return css;
}
-/* convenient tests for these bits */
-static inline bool cgroup_is_dead(const struct cgroup *cgrp)
-{
- return !(cgrp->self.flags & CSS_ONLINE);
-}
-
static void cgroup_get(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
@@ -518,11 +467,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
}
EXPORT_SYMBOL_GPL(of_css);
-static int notify_on_release(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-}
-
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
@@ -553,15 +497,6 @@ static int notify_on_release(const struct cgroup *cgrp)
else
/**
- * for_each_subsys - iterate all enabled cgroup subsystems
- * @ss: the iteration cursor
- * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- */
-#define for_each_subsys(ss, ssid) \
- for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
- (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
-
-/**
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -585,10 +520,6 @@ static int notify_on_release(const struct cgroup *cgrp)
} \
} while (false)
-/* iterate across the hierarchies */
-#define for_each_root(root) \
- list_for_each_entry((root), &cgroup_roots, root_list)
-
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
@@ -615,29 +546,6 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-static void cgroup_release_agent(struct work_struct *work);
-static void check_for_release(struct cgroup *cgrp);
-
-/*
- * A cgroup can be associated with multiple css_sets as different tasks may
- * belong to different cgroups on different hierarchies. In the other
- * direction, a css_set is naturally associated with multiple cgroups.
- * This M:N relationship is represented by the following link structure
- * which exists for each association and allows traversing the associations
- * from both sides.
- */
-struct cgrp_cset_link {
- /* the cgroup and css_set this link associates */
- struct cgroup *cgrp;
- struct css_set *cset;
-
- /* list of cgrp_cset_links anchored at cgrp->cset_links */
- struct list_head cset_link;
-
- /* list of cgrp_cset_links anchored at css_set->cgrp_links */
- struct list_head cgrp_link;
-};
-
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
@@ -647,12 +555,12 @@ struct cgrp_cset_link {
*/
struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
- .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
- .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
};
static int css_set_count = 1; /* 1 for init_css_set */
@@ -699,7 +607,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (!trigger)
break;
- check_for_release(cgrp);
+ cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);
cgrp = cgroup_parent(cgrp);
@@ -808,7 +716,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset)
+void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -838,31 +746,6 @@ static void put_css_set_locked(struct css_set *cset)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset)
-{
- unsigned long flags;
-
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cset->refcount, -1, 1))
- return;
-
- spin_lock_irqsave(&css_set_lock, flags);
- put_css_set_locked(cset);
- spin_unlock_irqrestore(&css_set_lock, flags);
-}
-
-/*
- * refcounted get/put for css_set objects
- */
-static inline void get_css_set(struct css_set *cset)
-{
- atomic_inc(&cset->refcount);
-}
-
/**
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
@@ -1095,13 +978,13 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}
atomic_set(&cset->refcount, 1);
- INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
- INIT_LIST_HEAD(&cset->mg_preload_node);
- INIT_LIST_HEAD(&cset->mg_node);
INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);
+ INIT_LIST_HEAD(&cset->cgrp_links);
+ INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
@@ -1138,7 +1021,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
return cset;
}
-static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -1166,7 +1049,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
}
-static void cgroup_free_root(struct cgroup_root *root)
+void cgroup_free_root(struct cgroup_root *root)
{
if (root) {
idr_destroy(&root->cgroup_idr);
@@ -1283,8 +1166,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
-static struct cgroup *task_cgroup_from_root(struct task_struct *task,
- struct cgroup_root *root)
+struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroup_root *root)
{
/*
* No need to lock the task - since we hold cgroup_mutex the
@@ -1321,7 +1204,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
*/
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
-static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
@@ -1415,7 +1297,7 @@ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
-static void cgroup_kn_unlock(struct kernfs_node *kn)
+void cgroup_kn_unlock(struct kernfs_node *kn)
{
struct cgroup *cgrp;
@@ -1447,8 +1329,7 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
- bool drain_offline)
+struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
{
struct cgroup *cgrp;
@@ -1532,9 +1413,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
- cfts = cgroup_dfl_base_files;
+ cfts = cgroup_base_files;
else
- cfts = cgroup_legacy_base_files;
+ cfts = cgroup1_base_files;
return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
}
@@ -1559,7 +1440,7 @@ err:
return ret;
}
-static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
+int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
@@ -1629,8 +1510,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
return 0;
}
-static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
- struct kernfs_root *kf_root)
+int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root)
{
int len = 0;
char *buf = NULL;
@@ -1656,237 +1537,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
-static int cgroup_show_options(struct seq_file *seq,
- struct kernfs_root *kf_root)
-{
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_subsys *ss;
- int ssid;
-
- if (root != &cgrp_dfl_root)
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_show_option(seq, ss->legacy_name, NULL);
- if (root->flags & CGRP_ROOT_NOPREFIX)
- seq_puts(seq, ",noprefix");
- if (root->flags & CGRP_ROOT_XATTR)
- seq_puts(seq, ",xattr");
-
- spin_lock(&release_agent_path_lock);
- if (strlen(root->release_agent_path))
- seq_show_option(seq, "release_agent",
- root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
-
- if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
- seq_puts(seq, ",clone_children");
- if (strlen(root->name))
- seq_show_option(seq, "name", root->name);
- return 0;
-}
-
-struct cgroup_sb_opts {
- u16 subsys_mask;
- unsigned int flags;
- char *release_agent;
- bool cpuset_clone_children;
- char *name;
- /* User explicitly requested empty subsystem */
- bool none;
-};
-
-static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
-{
- char *token, *o = data;
- bool all_ss = false, one_ss = false;
- u16 mask = U16_MAX;
- struct cgroup_subsys *ss;
- int nr_opts = 0;
- int i;
-
-#ifdef CONFIG_CPUSETS
- mask = ~((u16)1 << cpuset_cgrp_id);
-#endif
-
- memset(opts, 0, sizeof(*opts));
-
- while ((token = strsep(&o, ",")) != NULL) {
- nr_opts++;
-
- if (!*token)
- return -EINVAL;
- if (!strcmp(token, "none")) {
- /* Explicitly have no subsystems */
- opts->none = true;
- continue;
- }
- if (!strcmp(token, "all")) {
- /* Mutually exclusive option 'all' + subsystem name */
- if (one_ss)
- return -EINVAL;
- all_ss = true;
- continue;
- }
- if (!strcmp(token, "noprefix")) {
- opts->flags |= CGRP_ROOT_NOPREFIX;
- continue;
- }
- if (!strcmp(token, "clone_children")) {
- opts->cpuset_clone_children = true;
- continue;
- }
- if (!strcmp(token, "xattr")) {
- opts->flags |= CGRP_ROOT_XATTR;
- continue;
- }
- if (!strncmp(token, "release_agent=", 14)) {
- /* Specifying two release agents is forbidden */
- if (opts->release_agent)
- return -EINVAL;
- opts->release_agent =
- kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
- if (!opts->release_agent)
- return -ENOMEM;
- continue;
- }
- if (!strncmp(token, "name=", 5)) {
- const char *name = token + 5;
- /* Can't specify an empty name */
- if (!strlen(name))
- return -EINVAL;
- /* Must match [\w.-]+ */
- for (i = 0; i < strlen(name); i++) {
- char c = name[i];
- if (isalnum(c))
- continue;
- if ((c == '.') || (c == '-') || (c == '_'))
- continue;
- return -EINVAL;
- }
- /* Specifying two names is forbidden */
- if (opts->name)
- return -EINVAL;
- opts->name = kstrndup(name,
- MAX_CGROUP_ROOT_NAMELEN - 1,
- GFP_KERNEL);
- if (!opts->name)
- return -ENOMEM;
-
- continue;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->legacy_name))
- continue;
- if (!cgroup_ssid_enabled(i))
- continue;
- if (cgroup_ssid_no_v1(i))
- continue;
-
- /* Mutually exclusive option 'all' + subsystem name */
- if (all_ss)
- return -EINVAL;
- opts->subsys_mask |= (1 << i);
- one_ss = true;
-
- break;
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
- }
-
- /*
- * If the 'all' option was specified select all the subsystems,
- * otherwise if 'none', 'name=' and a subsystem name options were
- * not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
-
- /*
- * Option noprefix was introduced just for backward compatibility
- * with the old cpuset, so we allow noprefix only if mounting just
- * the cpuset subsystem.
- */
- if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
- return -EINVAL;
-
- /* Can't specify "none" and some subsystems */
- if (opts->subsys_mask && opts->none)
- return -EINVAL;
-
- return 0;
-}
-
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
- int ret = 0;
- struct cgroup_root *root = cgroup_root_from_kf(kf_root);
- struct cgroup_sb_opts opts;
- u16 added_mask, removed_mask;
-
- if (root == &cgrp_dfl_root) {
- pr_err("remount is not allowed\n");
- return -EINVAL;
- }
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
- /* See what subsystems are wanted */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
- pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
- task_tgid_nr(current), current->comm);
-
- added_mask = opts.subsys_mask & ~root->subsys_mask;
- removed_mask = root->subsys_mask & ~opts.subsys_mask;
-
- /* Don't allow flags or name to change at remount */
- if ((opts.flags ^ root->flags) ||
- (opts.name && strcmp(opts.name, root->name))) {
- pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags, opts.name ?: "", root->flags, root->name);
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* remounting is not allowed for populated hierarchies */
- if (!list_empty(&root->cgrp.self.children)) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- ret = rebind_subsystems(root, added_mask);
- if (ret)
- goto out_unlock;
-
- WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
-
- if (opts.release_agent) {
- spin_lock(&release_agent_path_lock);
- strcpy(root->release_agent_path, opts.release_agent);
- spin_unlock(&release_agent_path_lock);
- }
-
- trace_cgroup_remount(root);
-
- out_unlock:
- kfree(opts.release_agent);
- kfree(opts.name);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ pr_err("remount is not allowed\n");
+ return -EINVAL;
}
/*
@@ -1964,11 +1618,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
- INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
+ INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
}
-static void init_cgroup_root(struct cgroup_root *root,
- struct cgroup_sb_opts *opts)
+void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
{
struct cgroup *cgrp = &root->cgrp;
@@ -1987,10 +1640,11 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct kernfs_syscall_ops *kf_sops;
struct css_set *cset;
int i, ret;
@@ -2022,7 +1676,10 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
if (ret)
goto cancel_ref;
- root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
+ kf_sops = root == &cgrp_dfl_root ?
+ &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
+
+ root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED,
root_cgrp);
if (IS_ERR(root->kf_root)) {
@@ -2080,182 +1737,18 @@ out:
return ret;
}
-static struct dentry *cgroup_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
+ struct cgroup_root *root, unsigned long magic,
+ struct cgroup_namespace *ns)
{
- bool is_v2 = fs_type == &cgroup2_fs_type;
- struct super_block *pinned_sb = NULL;
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
- struct cgroup_subsys *ss;
- struct cgroup_root *root;
- struct cgroup_sb_opts opts;
struct dentry *dentry;
- int ret;
- int i;
bool new_sb;
- get_cgroup_ns(ns);
-
- /* Check if the caller has permission to mount. */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
- put_cgroup_ns(ns);
- return ERR_PTR(-EPERM);
- }
-
- /*
- * The first time anyone tries to mount a cgroup, enable the list
- * linking each css_set to its tasks and fix up all existing tasks.
- */
- if (!use_task_css_set_links)
- cgroup_enable_task_cg_lists();
-
- if (is_v2) {
- if (data) {
- pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
- put_cgroup_ns(ns);
- return ERR_PTR(-EINVAL);
- }
- cgrp_dfl_visible = true;
- root = &cgrp_dfl_root;
- cgroup_get(&root->cgrp);
- goto out_mount;
- }
-
- cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
-
- /* First find the desired set of subsystems */
- ret = parse_cgroupfs_options(data, &opts);
- if (ret)
- goto out_unlock;
-
- /*
- * Destruction of cgroup root is asynchronous, so subsystems may
- * still be dying after the previous unmount. Let's drain the
- * dying subsystems. We just need to ensure that the ones
- * unmounted previously finish dying and don't care about new ones
- * starting. Testing ref liveliness is good enough.
- */
- for_each_subsys(ss, i) {
- if (!(opts.subsys_mask & (1 << i)) ||
- ss->root == &cgrp_dfl_root)
- continue;
-
- if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
- cgroup_put(&ss->root->cgrp);
- }
-
- for_each_root(root) {
- bool name_match = false;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- /*
- * If we asked for a name then it must match. Also, if
- * name matches but sybsys_mask doesn't, we should fail.
- * Remember whether name matched.
- */
- if (opts.name) {
- if (strcmp(opts.name, root->name))
- continue;
- name_match = true;
- }
-
- /*
- * If we asked for subsystems (or explicitly for no
- * subsystems) then they must match.
- */
- if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->subsys_mask)) {
- if (!name_match)
- continue;
- ret = -EBUSY;
- goto out_unlock;
- }
-
- if (root->flags ^ opts.flags)
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
-
- /*
- * We want to reuse @root whose lifetime is governed by its
- * ->cgrp. Let's check whether @root is alive and keep it
- * that way. As cgroup_kill_sb() can happen anytime, we
- * want to block it by pinning the sb so that @root doesn't
- * get killed before mount is complete.
- *
- * With the sb pinned, tryget_live can reliably indicate
- * whether @root can be reused. If it's being killed,
- * drain it. We can use wait_queue for the wait but this
- * path is super cold. Let's just sleep a bit and retry.
- */
- pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
- if (IS_ERR(pinned_sb) ||
- !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
- mutex_unlock(&cgroup_mutex);
- if (!IS_ERR_OR_NULL(pinned_sb))
- deactivate_super(pinned_sb);
- msleep(10);
- ret = restart_syscall();
- goto out_free;
- }
-
- ret = 0;
- goto out_unlock;
- }
+ dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
/*
- * No such thing, create a new one. name= matching without subsys
- * specification is allowed for already existing hierarchies but we
- * can't create new one without subsys specification.
- */
- if (!opts.subsys_mask && !opts.none) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
- /* Hierarchies may only be created in the initial cgroup namespace. */
- if (ns != &init_cgroup_ns) {
- ret = -EPERM;
- goto out_unlock;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- ret = -ENOMEM;
- goto out_unlock;
- }
-
- init_cgroup_root(root, &opts);
-
- ret = cgroup_setup_root(root, opts.subsys_mask);
- if (ret)
- cgroup_free_root(root);
-
-out_unlock:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(opts.release_agent);
- kfree(opts.name);
-
- if (ret) {
- put_cgroup_ns(ns);
- return ERR_PTR(ret);
- }
-out_mount:
- dentry = kernfs_mount(fs_type, flags, root->kf_root,
- is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
- &new_sb);
-
- /*
- * In non-init cgroup namespace, instead of root cgroup's
- * dentry, we return the dentry corresponding to the
- * cgroupns->root_cgrp.
+ * In non-init cgroup namespace, instead of root cgroup's dentry,
+ * we return the dentry corresponding to the cgroupns->root_cgrp.
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
@@ -2277,13 +1770,45 @@ out_mount:
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+ return dentry;
+}
+
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct dentry *dentry;
+
+ get_cgroup_ns(ns);
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EPERM);
+ }
+
/*
- * If @pinned_sb, we're reusing an existing root and holding an
- * extra ref on its sb. Mount is complete. Put the extra ref.
+ * The first time anyone tries to mount a cgroup, enable the list
+ * linking each css_set to its tasks and fix up all existing tasks.
*/
- if (pinned_sb) {
- WARN_ON(new_sb);
- deactivate_super(pinned_sb);
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
+ if (fs_type == &cgroup2_fs_type) {
+ if (data) {
+ pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EINVAL);
+ }
+ cgrp_dfl_visible = true;
+ cgroup_get(&cgrp_dfl_root.cgrp);
+
+ dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
+ CGROUP2_SUPER_MAGIC, ns);
+ } else {
+ dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
+ CGROUP_SUPER_MAGIC, ns);
}
put_cgroup_ns(ns);
@@ -2311,7 +1836,7 @@ static void cgroup_kill_sb(struct super_block *sb)
kernfs_kill_sb(sb);
}
-static struct file_system_type cgroup_fs_type = {
+struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
@@ -2325,8 +1850,8 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
- struct cgroup_namespace *ns)
+int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
{
struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
@@ -2389,49 +1914,18 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
-/* used to track tasks and other necessary states during migration */
-struct cgroup_taskset {
- /* the src and dst cset list running through cset->mg_node */
- struct list_head src_csets;
- struct list_head dst_csets;
-
- /* the subsys currently being processed */
- int ssid;
-
- /*
- * Fields for cgroup_taskset_*() iteration.
- *
- * Before migration is committed, the target migration tasks are on
- * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
- * the csets on ->dst_csets. ->csets point to either ->src_csets
- * or ->dst_csets depending on whether migration is committed.
- *
- * ->cur_csets and ->cur_task point to the current task position
- * during iteration.
- */
- struct list_head *csets;
- struct css_set *cur_cset;
- struct task_struct *cur_task;
-};
-
-#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
- .src_csets = LIST_HEAD_INIT(tset.src_csets), \
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
- .csets = &tset.src_csets, \
-}
-
/**
- * cgroup_taskset_add - try to add a migration target task to a taskset
+ * cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
- * @tset: target taskset
+ * @mgctx: target migration context
*
- * Add @task, which is a migration target, to @tset. This function becomes
- * noop if @task doesn't need to be migrated. @task's css_set should have
- * been added as a migration source and @task->cg_list will be moved from
- * the css_set's tasks list to mg_tasks one.
+ * Add @task, which is a migration target, to @mgctx->tset. This function
+ * becomes noop if @task doesn't need to be migrated. @task's css_set
+ * should have been added as a migration source and @task->cg_list will be
+ * moved from the css_set's tasks list to mg_tasks one.
*/
-static void cgroup_taskset_add(struct task_struct *task,
- struct cgroup_taskset *tset)
+static void cgroup_migrate_add_task(struct task_struct *task,
+ struct cgroup_mgctx *mgctx)
{
struct css_set *cset;
@@ -2451,10 +1945,11 @@ static void cgroup_taskset_add(struct task_struct *task,
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
- list_add_tail(&cset->mg_node, &tset->src_csets);
+ list_add_tail(&cset->mg_node,
+ &mgctx->tset.src_csets);
if (list_empty(&cset->mg_dst_cset->mg_node))
- list_move_tail(&cset->mg_dst_cset->mg_node,
- &tset->dst_csets);
+ list_add_tail(&cset->mg_dst_cset->mg_node,
+ &mgctx->tset.dst_csets);
}
/**
@@ -2521,17 +2016,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
/**
* cgroup_taskset_migrate - migrate a taskset
- * @tset: taget taskset
- * @root: cgroup root the migration is taking place on
+ * @mgctx: migration context
*
- * Migrate tasks in @tset as setup by migration preparation functions.
+ * Migrate tasks in @mgctx as setup by migration preparation functions.
* This function fails iff one of the ->can_attach callbacks fails and
- * guarantees that either all or none of the tasks in @tset are migrated.
- * @tset is consumed regardless of success.
+ * guarantees that either all or none of the tasks in @mgctx are migrated.
+ * @mgctx is consumed regardless of success.
*/
-static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- struct cgroup_root *root)
+static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
+ struct cgroup_taskset *tset = &mgctx->tset;
struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
@@ -2542,7 +2036,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
return 0;
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->can_attach) {
tset->ssid = ssid;
ret = ss->can_attach(tset);
@@ -2578,7 +2072,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ss->attach) {
tset->ssid = ssid;
ss->attach(tset);
@@ -2589,7 +2083,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
if (ssid == failed_ssid)
break;
if (ss->cancel_attach) {
@@ -2616,7 +2110,7 @@ out_release_tset:
* zero for migration destination cgroups with tasks so that child cgroups
* don't compete against tasks.
*/
-static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
{
return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
!dst_cgrp->subtree_control;
@@ -2624,25 +2118,31 @@ static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
/**
* cgroup_migrate_finish - cleanup after attach
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
*
* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
* those functions for details.
*/
-static void cgroup_migrate_finish(struct list_head *preloaded_csets)
+void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
+ LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
+
+ list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
+ list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+
+ list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
+
spin_unlock_irq(&css_set_lock);
}
@@ -2650,10 +2150,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* cgroup_migrate_add_src - add a migration source css_set
* @src_cset: the source css_set to add
* @dst_cgrp: the destination cgroup
- * @preloaded_csets: list of preloaded css_sets
+ * @mgctx: migration context
*
* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
- * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
* This function may be called without holding cgroup_threadgroup_rwsem
@@ -2662,9 +2162,9 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* into play and the preloaded css_sets are guaranteed to cover all
* migrations.
*/
-static void cgroup_migrate_add_src(struct css_set *src_cset,
- struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
+void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+ struct cgroup_mgctx *mgctx)
{
struct cgroup *src_cgrp;
@@ -2692,33 +2192,35 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add(&src_cset->mg_preload_node, preloaded_csets);
+ list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @preloaded_csets: list of preloaded source css_sets
+ * @mgctx: migration context
*
* Tasks are about to be moved and all the source css_sets have been
- * preloaded to @preloaded_csets. This function looks up and pins all
- * destination css_sets, links each to its source, and append them to
- * @preloaded_csets.
+ * preloaded to @mgctx->preloaded_src_csets. This function looks up and
+ * pins all destination css_sets, links each to its source, and append them
+ * to @mgctx->preloaded_dst_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
- * @preloaded_csets.
+ * @mgctx.
*/
-static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
+int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
/* look up the dst cset for each src cset and link it to src */
- list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
+ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_preload_node) {
struct css_set *dst_cset;
+ struct cgroup_subsys *ss;
+ int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
@@ -2743,15 +2245,19 @@ static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
- list_add(&dst_cset->mg_preload_node, &csets);
+ list_add_tail(&dst_cset->mg_preload_node,
+ &mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
+
+ for_each_subsys(ss, ssid)
+ if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
+ mgctx->ss_mask |= 1 << ssid;
}
- list_splice_tail(&csets, preloaded_csets);
return 0;
err:
- cgroup_migrate_finish(&csets);
+ cgroup_migrate_finish(mgctx);
return -ENOMEM;
}
@@ -2759,7 +2265,7 @@ err:
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
- * @root: cgroup root migration is taking place on
+ * @mgctx: migration context
*
* Migrate a process or task denoted by @leader. If migrating a process,
* the caller must be holding cgroup_threadgroup_rwsem. The caller is also
@@ -2773,10 +2279,9 @@ err:
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
-static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
- struct cgroup_root *root)
+int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup_mgctx *mgctx)
{
- struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct task_struct *task;
/*
@@ -2788,14 +2293,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
rcu_read_lock();
task = leader;
do {
- cgroup_taskset_add(task, &tset);
+ cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
- return cgroup_taskset_migrate(&tset, root);
+ return cgroup_migrate_execute(mgctx);
}
/**
@@ -2806,10 +2311,10 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
*
* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
-static int cgroup_attach_task(struct cgroup *dst_cgrp,
- struct task_struct *leader, bool threadgroup)
+int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ bool threadgroup)
{
- LIST_HEAD(preloaded_csets);
+ DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
int ret;
@@ -2821,8 +2326,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
rcu_read_lock();
task = leader;
do {
- cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
- &preloaded_csets);
+ cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
@@ -2830,11 +2334,11 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&mgctx);
if (!ret)
- ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
+ ret = cgroup_migrate(leader, threadgroup, &mgctx);
- cgroup_migrate_finish(&preloaded_csets);
+ cgroup_migrate_finish(&mgctx);
if (!ret)
trace_cgroup_attach_task(dst_cgrp, leader, threadgroup);
@@ -2846,20 +2350,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
{
- const struct cred *cred = current_cred();
- const struct cred *tcred = get_task_cred(task);
int ret = 0;
- /*
- * even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
- */
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- ret = -EACCES;
-
- if (!ret && cgroup_on_dfl(dst_cgrp)) {
+ if (cgroup_on_dfl(dst_cgrp)) {
struct super_block *sb = of->file->f_path.dentry->d_sb;
struct cgroup *cgrp;
struct inode *inode;
@@ -2877,9 +2370,21 @@ static int cgroup_procs_write_permission(struct task_struct *task,
ret = inode_permission(inode, MAY_WRITE);
iput(inode);
}
+ } else {
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = get_task_cred(task);
+
+ /*
+ * even if we're attaching all tasks in the thread group,
+ * we only need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EACCES;
+ put_cred(tcred);
}
- put_cred(tcred);
return ret;
}
@@ -2888,8 +2393,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
* function to attach either it or all tasks in its threadgroup. Will lock
* cgroup_mutex and threadgroup.
*/
-static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, loff_t off, bool threadgroup)
+ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
struct cgroup_subsys *ss;
@@ -2920,11 +2425,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
tsk = tsk->group_leader;
/*
- * Workqueue threads may acquire PF_NO_SETAFFINITY and become
- * trapped in a cpuset, or RT worker may be born in a cgroup
- * with no rt_runtime allocated. Just say no.
+ * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+ * If userland migrates such a kthread to a non-root cgroup, it can
+ * become trapped in a cpuset, or RT kthread may be born in a
+ * cgroup with no rt_runtime allocated. Just say no.
*/
- if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out_unlock_rcu;
}
@@ -2950,86 +2456,12 @@ out_unlock_threadgroup:
return ret ?: nbytes;
}
-/**
- * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
- * @from: attach to all cgroups of a given task
- * @tsk: the task to be attached
- */
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
-{
- struct cgroup_root *root;
- int retval = 0;
-
- mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
- for_each_root(root) {
- struct cgroup *from_cgrp;
-
- if (root == &cgrp_dfl_root)
- continue;
-
- spin_lock_irq(&css_set_lock);
- from_cgrp = task_cgroup_from_root(from, root);
- spin_unlock_irq(&css_set_lock);
-
- retval = cgroup_attach_task(from_cgrp, tsk, false);
- if (retval)
- break;
- }
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
-
-static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- return __cgroup_procs_write(of, buf, nbytes, off, false);
-}
-
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off)
{
return __cgroup_procs_write(of, buf, nbytes, off, true);
}
-static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct cgroup *cgrp;
-
- BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
- cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!cgrp)
- return -ENODEV;
- spin_lock(&release_agent_path_lock);
- strlcpy(cgrp->root->release_agent_path, strstrip(buf),
- sizeof(cgrp->root->release_agent_path));
- spin_unlock(&release_agent_path_lock);
- cgroup_kn_unlock(of->kn);
- return nbytes;
-}
-
-static int cgroup_release_agent_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- spin_lock(&release_agent_path_lock);
- seq_puts(seq, cgrp->root->release_agent_path);
- spin_unlock(&release_agent_path_lock);
- seq_putc(seq, '\n');
- return 0;
-}
-
-static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
-{
- seq_puts(seq, "0\n");
- return 0;
-}
-
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
@@ -3075,8 +2507,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
- LIST_HEAD(preloaded_csets);
- struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+ DEFINE_CGROUP_MGCTX(mgctx);
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
@@ -3092,33 +2523,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgrp_cset_link *link;
list_for_each_entry(link, &dsct->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, dsct,
- &preloaded_csets);
+ cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
struct task_struct *task, *ntask;
- /* src_csets precede dst_csets, break on the first dst_cset */
- if (!src_cset->mg_src_cgrp)
- break;
-
/* all tasks in src_csets need to be migrated */
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
- cgroup_taskset_add(task, &tset);
+ cgroup_migrate_add_task(task, &mgctx);
}
spin_unlock_irq(&css_set_lock);
- ret = cgroup_taskset_migrate(&tset, cgrp->root);
+ ret = cgroup_migrate_execute(&mgctx);
out_finish:
- cgroup_migrate_finish(&preloaded_csets);
+ cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -3131,7 +2557,7 @@ out_finish:
* controller while the previous css is still around. This function grabs
* cgroup_mutex and drains the previous css instances of @cgrp's subtree.
*/
-static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
__acquires(&cgroup_mutex)
{
struct cgroup *dsct;
@@ -3244,7 +2670,7 @@ static bool css_visible(struct cgroup_subsys_state *css)
*
* Returns 0 on success, -errno on failure. On failure, csses which have
* been processed already aren't cleaned up. The caller is responsible for
- * cleaning up with cgroup_apply_control_disble().
+ * cleaning up with cgroup_apply_control_disable().
*/
static int cgroup_apply_control_enable(struct cgroup *cgrp)
{
@@ -3503,6 +2929,23 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
return 0;
}
+static int cgroup_file_open(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->open)
+ return cft->open(of);
+ return 0;
+}
+
+static void cgroup_file_release(struct kernfs_open_file *of)
+{
+ struct cftype *cft = of->kn->priv;
+
+ if (cft->release)
+ cft->release(of);
+}
+
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -3553,7 +2996,8 @@ static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
{
- seq_cft(seq)->seq_stop(seq, v);
+ if (seq_cft(seq)->seq_stop)
+ seq_cft(seq)->seq_stop(seq, v);
}
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
@@ -3575,12 +3019,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
static struct kernfs_ops cgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
.write = cgroup_file_write,
.seq_show = cgroup_seqfile_show,
};
static struct kernfs_ops cgroup_kf_ops = {
.atomic_write_len = PAGE_SIZE,
+ .open = cgroup_file_open,
+ .release = cgroup_file_release,
.write = cgroup_file_write,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
@@ -3588,48 +3036,6 @@ static struct kernfs_ops cgroup_kf_ops = {
.seq_show = cgroup_seqfile_show,
};
-/*
- * cgroup_rename - Only allow simple rename of directories in place.
- */
-static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
- const char *new_name_str)
-{
- struct cgroup *cgrp = kn->priv;
- int ret;
-
- if (kernfs_type(kn) != KERNFS_DIR)
- return -ENOTDIR;
- if (kn->parent != new_parent)
- return -EIO;
-
- /*
- * This isn't a proper migration and its usefulness is very
- * limited. Disallow on the default hierarchy.
- */
- if (cgroup_on_dfl(cgrp))
- return -EPERM;
-
- /*
- * We're gonna grab cgroup_mutex which nests outside kernfs
- * active_ref. kernfs_rename() doesn't require active_ref
- * protection. Break them before grabbing cgroup_mutex.
- */
- kernfs_break_active_protection(new_parent);
- kernfs_break_active_protection(kn);
-
- mutex_lock(&cgroup_mutex);
-
- ret = kernfs_rename(kn, new_parent, new_name_str);
- if (!ret)
- trace_cgroup_rename(cgrp);
-
- mutex_unlock(&cgroup_mutex);
-
- kernfs_unbreak_active_protection(kn);
- kernfs_unbreak_active_protection(new_parent);
- return ret;
-}
-
/* set uid and gid of cgroup dirs and files to that of the creator */
static int cgroup_kn_set_ugid(struct kernfs_node *kn)
{
@@ -3926,26 +3332,6 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup. The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
- */
-static int cgroup_task_count(const struct cgroup *cgrp)
-{
- int count = 0;
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += atomic_read(&link->cset->refcount);
- spin_unlock_irq(&css_set_lock);
- return count;
-}
-
-/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4343,560 +3729,69 @@ void css_task_iter_end(struct css_task_iter *it)
put_task_struct(it->cur_task);
}
-/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
- * @to: cgroup to which the tasks will be moved
- * @from: cgroup in which the tasks currently reside
- *
- * Locking rules between cgroup_post_fork() and the migration path
- * guarantee that, if a task is forking while being migrated, the new child
- * is guaranteed to be either visible in the source cgroup after the
- * parent's migration is complete or put into the target cgroup. No task
- * can slip out of migration through forking.
- */
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
-{
- LIST_HEAD(preloaded_csets);
- struct cgrp_cset_link *link;
- struct css_task_iter it;
- struct task_struct *task;
- int ret;
-
- if (!cgroup_may_migrate_to(to))
- return -EBUSY;
-
- mutex_lock(&cgroup_mutex);
-
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
- /* all tasks in @from are being moved, all csets are source */
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &from->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- spin_unlock_irq(&css_set_lock);
-
- ret = cgroup_migrate_prepare_dst(&preloaded_csets);
- if (ret)
- goto out_err;
-
- /*
- * Migrate tasks one-by-one until @from is empty. This fails iff
- * ->can_attach() fails.
- */
- do {
- css_task_iter_start(&from->self, &it);
- task = css_task_iter_next(&it);
- if (task)
- get_task_struct(task);
- css_task_iter_end(&it);
-
- if (task) {
- ret = cgroup_migrate(task, false, to->root);
- if (!ret)
- trace_cgroup_transfer_tasks(to, task, false);
- put_task_struct(task);
- }
- } while (task && !ret);
-out_err:
- cgroup_migrate_finish(&preloaded_csets);
- percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-/*
- * Stuff for reading the 'tasks'/'procs' files.
- *
- * Reading this file can return large amounts of data if a cgroup has
- * *lots* of attached tasks. So it may need several calls to read(),
- * but we cannot guarantee that the information we produce is correct
- * unless we produce it entirely atomically.
- *
- */
-
-/* which pidlist file are we talking about? */
-enum cgroup_filetype {
- CGROUP_FILE_PROCS,
- CGROUP_FILE_TASKS,
-};
-
-/*
- * A pidlist is a list of pids that virtually represents the contents of one
- * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
- * a pair (one each for procs, tasks) for each pid namespace that's relevant
- * to the cgroup.
- */
-struct cgroup_pidlist {
- /*
- * used to find which pidlist is wanted. doesn't change as long as
- * this particular list stays in the list.
- */
- struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
- /* array of xids */
- pid_t *list;
- /* how many elements the above list has */
- int length;
- /* each of these stored in a list by its cgroup */
- struct list_head links;
- /* pointer to the cgroup we belong to, for list removal purposes */
- struct cgroup *owner;
- /* for delayed destruction */
- struct delayed_work destroy_dwork;
-};
-
-/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
- if (PIDLIST_TOO_LARGE(count))
- return vmalloc(count * sizeof(pid_t));
- else
- return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
- kvfree(p);
-}
-
-/*
- * Used to destroy all pidlists lingering waiting for destroy timer. None
- * should be left afterwards.
- */
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
-{
- struct cgroup_pidlist *l, *tmp_l;
-
- mutex_lock(&cgrp->pidlist_mutex);
- list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
- mutex_unlock(&cgrp->pidlist_mutex);
-
- flush_workqueue(cgroup_pidlist_destroy_wq);
- BUG_ON(!list_empty(&cgrp->pidlists));
-}
-
-static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
- destroy_dwork);
- struct cgroup_pidlist *tofree = NULL;
-
- mutex_lock(&l->owner->pidlist_mutex);
-
- /*
- * Destroy iff we didn't get queued again. The state won't change
- * as destroy_dwork can only be queued while locked.
- */
- if (!delayed_work_pending(dwork)) {
- list_del(&l->links);
- pidlist_free(l->list);
- put_pid_ns(l->key.ns);
- tofree = l;
- }
-
- mutex_unlock(&l->owner->pidlist_mutex);
- kfree(tofree);
-}
-
-/*
- * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
- * Returns the number of unique elements.
- */
-static int pidlist_uniq(pid_t *list, int length)
-{
- int src, dest = 1;
-
- /*
- * we presume the 0th element is unique, so i starts at 1. trivial
- * edge cases first; no work needs to be done for either
- */
- if (length == 0 || length == 1)
- return length;
- /* src and dest walk down the list; dest counts unique elements */
- for (src = 1; src < length; src++) {
- /* find next unique element */
- while (list[src] == list[src-1]) {
- src++;
- if (src == length)
- goto after;
- }
- /* dest always points to where the next unique element goes */
- list[dest] = list[src];
- dest++;
- }
-after:
- return dest;
-}
-
-/*
- * The two pid files - task and cgroup.procs - guaranteed that the result
- * is sorted, which forced this whole pidlist fiasco. As pid order is
- * different per namespace, each namespace needs differently sorted list,
- * making it impossible to use, for example, single rbtree of member tasks
- * sorted by task pointer. As pidlists can be fairly large, allocating one
- * per open file is dangerous, so cgroup had to implement shared pool of
- * pidlists keyed by cgroup and namespace.
- *
- * All this extra complexity was caused by the original implementation
- * committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if on the
- * default hierarchy so that no such expectation exists in the new
- * interface.
- *
- * Scrambling is done by swapping every two consecutive bits, which is
- * non-identity one-to-one mapping which disturbs sort order sufficiently.
- */
-static pid_t pid_fry(pid_t pid)
+static void cgroup_procs_release(struct kernfs_open_file *of)
{
- unsigned a = pid & 0x55555555;
- unsigned b = pid & 0xAAAAAAAA;
-
- return (a << 1) | (b >> 1);
-}
-
-static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
-{
- if (cgroup_on_dfl(cgrp))
- return pid_fry(pid);
- else
- return pid;
-}
-
-static int cmppid(const void *a, const void *b)
-{
- return *(pid_t *)a - *(pid_t *)b;
-}
-
-static int fried_cmppid(const void *a, const void *b)
-{
- return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
-}
-
-static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
- /* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = task_active_pid_ns(current);
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- list_for_each_entry(l, &cgrp->pidlists, links)
- if (l->key.type == type && l->key.ns == ns)
- return l;
- return NULL;
-}
-
-/*
- * find the appropriate pidlist for our purpose (given procs vs tasks)
- * returns with the lock on that pidlist already held, and takes care
- * of the use count, or returns NULL with no locks held if we're out of
- * memory.
- */
-static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
- enum cgroup_filetype type)
-{
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- l = cgroup_pidlist_find(cgrp, type);
- if (l)
- return l;
-
- /* entry not found; create a new one */
- l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
- if (!l)
- return l;
-
- INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
- l->key.type = type;
- /* don't need task_nsproxy() if we're looking at ourself */
- l->key.ns = get_pid_ns(task_active_pid_ns(current));
- l->owner = cgrp;
- list_add(&l->links, &cgrp->pidlists);
- return l;
-}
-
-/*
- * Load a cgroup's pidarray with either procs' tgids or tasks' pids
- */
-static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
- struct cgroup_pidlist **lp)
-{
- pid_t *array;
- int length;
- int pid, n = 0; /* used for populating the array */
- struct css_task_iter it;
- struct task_struct *tsk;
- struct cgroup_pidlist *l;
-
- lockdep_assert_held(&cgrp->pidlist_mutex);
-
- /*
- * If cgroup gets more users after we read count, we won't have
- * enough space - tough. This race is indistinguishable to the
- * caller from the case that the additional cgroup users didn't
- * show up until sometime later on.
- */
- length = cgroup_task_count(cgrp);
- array = pidlist_allocate(length);
- if (!array)
- return -ENOMEM;
- /* now, populate the array */
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- if (unlikely(n == length))
- break;
- /* get tgid or pid for procs or tasks file respectively */
- if (type == CGROUP_FILE_PROCS)
- pid = task_tgid_vnr(tsk);
- else
- pid = task_pid_vnr(tsk);
- if (pid > 0) /* make sure to only use valid results */
- array[n++] = pid;
- }
- css_task_iter_end(&it);
- length = n;
- /* now sort & (if procs) strip out duplicates */
- if (cgroup_on_dfl(cgrp))
- sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
- else
- sort(array, length, sizeof(pid_t), cmppid, NULL);
- if (type == CGROUP_FILE_PROCS)
- length = pidlist_uniq(array, length);
-
- l = cgroup_pidlist_find_create(cgrp, type);
- if (!l) {
- pidlist_free(array);
- return -ENOMEM;
+ if (of->priv) {
+ css_task_iter_end(of->priv);
+ kfree(of->priv);
}
-
- /* store array, freeing old if necessary */
- pidlist_free(l->list);
- l->list = array;
- l->length = length;
- *lp = l;
- return 0;
}
-/**
- * cgroupstats_build - build and fill cgroupstats
- * @stats: cgroupstats to fill information into
- * @dentry: A dentry entry belonging to the cgroup for which stats have
- * been requested.
- *
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- */
-int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
- struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
- struct cgroup *cgrp;
- struct css_task_iter it;
- struct task_struct *tsk;
-
- /* it should be kernfs_node belonging to cgroupfs and is a directory */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
- return -EINVAL;
-
- mutex_lock(&cgroup_mutex);
-
- /*
- * We aren't being called from kernfs and there's no guarantee on
- * @kn->priv's validity. For this and css_tryget_online_from_dir(),
- * @kn->priv is RCU safe. Let's do the RCU dancing.
- */
- rcu_read_lock();
- cgrp = rcu_dereference(kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
- rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
- return -ENOENT;
- }
- rcu_read_unlock();
+ struct kernfs_open_file *of = s->private;
+ struct css_task_iter *it = of->priv;
+ struct task_struct *task;
- css_task_iter_start(&cgrp->self, &it);
- while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
- case TASK_RUNNING:
- stats->nr_running++;
- break;
- case TASK_INTERRUPTIBLE:
- stats->nr_sleeping++;
- break;
- case TASK_UNINTERRUPTIBLE:
- stats->nr_uninterruptible++;
- break;
- case TASK_STOPPED:
- stats->nr_stopped++;
- break;
- default:
- if (delayacct_is_task_waiting_on_io(tsk))
- stats->nr_io_wait++;
- break;
- }
- }
- css_task_iter_end(&it);
+ do {
+ task = css_task_iter_next(it);
+ } while (task && !thread_group_leader(task));
- mutex_unlock(&cgroup_mutex);
- return 0;
+ return task;
}
-
-/*
- * seq_file methods for the tasks/procs files. The seq_file position is the
- * next pid to display; the seq_file iterator is a pointer to the pid
- * in the cgroup->l->list array.
- */
-
-static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
{
- /*
- * Initially we receive a position value that corresponds to
- * one more than the last pid shown (or 0 on the first call or
- * after a seek to the start). Use a binary-search to find the
- * next pid to display, if any
- */
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct cgroup_pidlist *l;
- enum cgroup_filetype type = seq_cft(s)->private;
- int index = 0, pid = *pos;
- int *iter, ret;
-
- mutex_lock(&cgrp->pidlist_mutex);
+ struct css_task_iter *it = of->priv;
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * When a seq_file is seeked, it's always traversed sequentially
+ * from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
-
- /*
- * Either this is the first start() after open or the matching
- * pidlist has been destroyed inbetween. Create a new one.
- */
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
- if (ret)
- return ERR_PTR(ret);
- }
- l = of->priv;
-
- if (pid) {
- int end = l->length;
-
- while (index < end) {
- int mid = (index + end) / 2;
- if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
- index = mid;
- break;
- } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
- index = mid + 1;
- else
- end = mid;
- }
- }
- /* If we're off the end of the array, we're done */
- if (index >= l->length)
- return NULL;
- /* Update the abstract position to be the actual pid that we found */
- iter = l->list + index;
- *pos = cgroup_pid_fry(cgrp, *iter);
- return iter;
-}
-
-static void cgroup_pidlist_stop(struct seq_file *s, void *v)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
-
- if (l)
- mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
- CGROUP_PIDLIST_DESTROY_DELAY);
- mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
-}
+ if (!it) {
+ if (WARN_ON_ONCE((*pos)++))
+ return ERR_PTR(-EINVAL);
-static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
- pid_t *p = v;
- pid_t *end = l->list + l->length;
- /*
- * Advance to the next pid in the array. If this goes off the
- * end, we're done
- */
- p++;
- if (p >= end) {
- return NULL;
- } else {
- *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
- return p;
+ it = kzalloc(sizeof(*it), GFP_KERNEL);
+ if (!it)
+ return ERR_PTR(-ENOMEM);
+ of->priv = it;
+ css_task_iter_start(&cgrp->self, it);
+ } else if (!(*pos)++) {
+ css_task_iter_end(it);
+ css_task_iter_start(&cgrp->self, it);
}
-}
-
-static int cgroup_pidlist_show(struct seq_file *s, void *v)
-{
- seq_printf(s, "%d\n", *(int *)v);
- return 0;
+ return cgroup_procs_next(s, NULL, NULL);
}
-static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static int cgroup_procs_show(struct seq_file *s, void *v)
{
- return notify_on_release(css->cgroup);
-}
-
-static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- else
- clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
- return 0;
-}
-
-static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
-}
-
-static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 val)
-{
- if (val)
- set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
- else
- clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
+ seq_printf(s, "%d\n", task_tgid_vnr(v));
return 0;
}
/* cgroup core interface files for the default hierarchy */
-static struct cftype cgroup_dfl_base_files[] = {
+static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.file_offset = offsetof(struct cgroup, procs_file),
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
+ .release = cgroup_procs_release,
+ .seq_start = cgroup_procs_start,
+ .seq_next = cgroup_procs_next,
+ .seq_show = cgroup_procs_show,
.write = cgroup_procs_write,
},
{
@@ -4917,51 +3812,6 @@ static struct cftype cgroup_dfl_base_files[] = {
{ } /* terminate */
};
-/* cgroup core interface files for the legacy hierarchies */
-static struct cftype cgroup_legacy_base_files[] = {
- {
- .name = "cgroup.procs",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_PROCS,
- .write = cgroup_procs_write,
- },
- {
- .name = "cgroup.clone_children",
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
- .name = "tasks",
- .seq_start = cgroup_pidlist_start,
- .seq_next = cgroup_pidlist_next,
- .seq_stop = cgroup_pidlist_stop,
- .seq_show = cgroup_pidlist_show,
- .private = CGROUP_FILE_TASKS,
- .write = cgroup_tasks_write,
- },
- {
- .name = "notify_on_release",
- .read_u64 = cgroup_read_notify_on_release,
- .write_u64 = cgroup_write_notify_on_release,
- },
- {
- .name = "release_agent",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_release_agent_show,
- .write = cgroup_release_agent_write,
- .max_write_len = PATH_MAX - 1,
- },
- { } /* terminate */
-};
-
/*
* css destruction is four-stage process.
*
@@ -5007,7 +3857,7 @@ static void css_free_work_fn(struct work_struct *work)
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
- cgroup_pidlist_destroy_all(cgrp);
+ cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
@@ -5302,8 +4152,7 @@ out_free_cgrp:
return ERR_PTR(ret);
}
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
{
struct cgroup *parent, *cgrp;
struct kernfs_node *kn;
@@ -5507,7 +4356,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);
- check_for_release(cgroup_parent(cgrp));
+ cgroup1_check_for_release(cgroup_parent(cgrp));
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5515,7 +4364,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return 0;
};
-static int cgroup_rmdir(struct kernfs_node *kn)
+int cgroup_rmdir(struct kernfs_node *kn)
{
struct cgroup *cgrp;
int ret = 0;
@@ -5535,10 +4384,8 @@ static int cgroup_rmdir(struct kernfs_node *kn)
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.remount_fs = cgroup_remount,
- .show_options = cgroup_show_options,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
- .rename = cgroup_rename,
.show_path = cgroup_show_path,
};
@@ -5646,8 +4493,8 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
/*
* The latency of the synchronize_sched() is too high for cgroups,
@@ -5697,7 +4544,7 @@ int __init cgroup_init(void)
continue;
}
- if (cgroup_ssid_no_v1(ssid))
+ if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
ss->name);
@@ -5744,15 +4591,6 @@ static int __init cgroup_wq_init(void)
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
-
- /*
- * Used to destroy pidlists and separate to serve as flush domain.
- * Cap @max_active to 1 too.
- */
- cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
- 0, 1);
- BUG_ON(!cgroup_pidlist_destroy_wq);
-
return 0;
}
core_initcall(cgroup_wq_init);
@@ -5835,42 +4673,6 @@ out:
return retval;
}
-/* Display information about each subsystem and each hierarchy */
-static int proc_cgroupstats_show(struct seq_file *m, void *v)
-{
- struct cgroup_subsys *ss;
- int i;
-
- seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
- /*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
- */
- mutex_lock(&cgroup_mutex);
-
- for_each_subsys(ss, i)
- seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->legacy_name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps),
- cgroup_ssid_enabled(i));
-
- mutex_unlock(&cgroup_mutex);
- return 0;
-}
-
-static int cgroupstats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_cgroupstats_show, NULL);
-}
-
-static const struct file_operations proc_cgroupstats_operations = {
- .open = cgroupstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -6050,76 +4852,6 @@ void cgroup_free(struct task_struct *task)
put_css_set(cset);
}
-static void check_for_release(struct cgroup *cgrp)
-{
- if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
- !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
- schedule_work(&cgrp->release_agent_work);
-}
-
-/*
- * Notify userspace when a cgroup is released, by running the
- * configured release agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
- *
- * Most likely, this user command will try to rmdir this cgroup.
- *
- * This races with the possibility that some other task will be
- * attached to this cgroup before it is removed, or that some other
- * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
- * The presumed 'rmdir' will fail quietly if this cgroup is no longer
- * unused, and this cgroup will be reprieved from its death sentence,
- * to continue to serve a useful existence. Next time it's released,
- * we will get notified again, if it still has 'notify_on_release' set.
- *
- * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
- * means only wait until the task is successfully execve()'d. The
- * separate release agent task is forked by call_usermodehelper(),
- * then control in this thread returns here, without waiting for the
- * release agent task. We don't bother to wait because the caller of
- * this routine has no use for the exit status of the release agent
- * task, so no sense holding our caller up for that.
- */
-static void cgroup_release_agent(struct work_struct *work)
-{
- struct cgroup *cgrp =
- container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL;
- char *argv[3], *envp[3];
- int ret;
-
- mutex_lock(&cgroup_mutex);
-
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf)
- goto out;
-
- spin_lock_irq(&css_set_lock);
- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- if (ret < 0 || ret >= PATH_MAX)
- goto out;
-
- argv[0] = agentbuf;
- argv[1] = pathbuf;
- argv[2] = NULL;
-
- /* minimal command environment */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- goto out_free;
-out:
- mutex_unlock(&cgroup_mutex);
-out_free:
- kfree(agentbuf);
- kfree(pathbuf);
-}
-
static int __init cgroup_disable(char *str)
{
struct cgroup_subsys *ss;
@@ -6141,33 +4873,6 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
-static int __init cgroup_no_v1(char *str)
-{
- struct cgroup_subsys *ss;
- char *token;
- int i;
-
- while ((token = strsep(&str, ",")) != NULL) {
- if (!*token)
- continue;
-
- if (!strcmp(token, "all")) {
- cgroup_no_v1_mask = U16_MAX;
- break;
- }
-
- for_each_subsys(ss, i) {
- if (strcmp(token, ss->name) &&
- strcmp(token, ss->legacy_name))
- continue;
-
- cgroup_no_v1_mask |= 1 << i;
- }
- }
- return 1;
-}
-__setup("cgroup_no_v1=", cgroup_no_v1);
-
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -6197,7 +4902,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See css_release_work_fn() for details.
*/
- cgrp = rcu_dereference(kn->priv);
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
@@ -6349,154 +5054,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
-/* cgroup namespaces */
-
-static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
-{
- return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
-}
-
-static void dec_cgroup_namespaces(struct ucounts *ucounts)
-{
- dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
-}
-
-static struct cgroup_namespace *alloc_cgroup_ns(void)
-{
- struct cgroup_namespace *new_ns;
- int ret;
-
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
- if (!new_ns)
- return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
- return ERR_PTR(ret);
- }
- atomic_set(&new_ns->count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
-}
-
-void free_cgroup_ns(struct cgroup_namespace *ns)
-{
- put_css_set(ns->root_cset);
- dec_cgroup_namespaces(ns->ucounts);
- put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- kfree(ns);
-}
-EXPORT_SYMBOL(free_cgroup_ns);
-
-struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
- struct user_namespace *user_ns,
- struct cgroup_namespace *old_ns)
-{
- struct cgroup_namespace *new_ns;
- struct ucounts *ucounts;
- struct css_set *cset;
-
- BUG_ON(!old_ns);
-
- if (!(flags & CLONE_NEWCGROUP)) {
- get_cgroup_ns(old_ns);
- return old_ns;
- }
-
- /* Allow only sysadmin to create cgroup namespace. */
- if (!ns_capable(user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
-
- ucounts = inc_cgroup_namespaces(user_ns);
- if (!ucounts)
- return ERR_PTR(-ENOSPC);
-
- /* It is not safe to take cgroup_mutex here */
- spin_lock_irq(&css_set_lock);
- cset = task_css_set(current);
- get_css_set(cset);
- spin_unlock_irq(&css_set_lock);
-
- new_ns = alloc_cgroup_ns();
- if (IS_ERR(new_ns)) {
- put_css_set(cset);
- dec_cgroup_namespaces(ucounts);
- return new_ns;
- }
-
- new_ns->user_ns = get_user_ns(user_ns);
- new_ns->ucounts = ucounts;
- new_ns->root_cset = cset;
-
- return new_ns;
-}
-
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
-{
- struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
-
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
- !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
- return -EPERM;
-
- /* Don't need to do anything if we are attaching to our own cgroupns. */
- if (cgroup_ns == nsproxy->cgroup_ns)
- return 0;
-
- get_cgroup_ns(cgroup_ns);
- put_cgroup_ns(nsproxy->cgroup_ns);
- nsproxy->cgroup_ns = cgroup_ns;
-
- return 0;
-}
-
-static struct ns_common *cgroupns_get(struct task_struct *task)
-{
- struct cgroup_namespace *ns = NULL;
- struct nsproxy *nsproxy;
-
- task_lock(task);
- nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->cgroup_ns;
- get_cgroup_ns(ns);
- }
- task_unlock(task);
-
- return ns ? &ns->ns : NULL;
-}
-
-static void cgroupns_put(struct ns_common *ns)
-{
- put_cgroup_ns(to_cg_ns(ns));
-}
-
-static struct user_namespace *cgroupns_owner(struct ns_common *ns)
-{
- return to_cg_ns(ns)->user_ns;
-}
-
-const struct proc_ns_operations cgroupns_operations = {
- .name = "cgroup",
- .type = CLONE_NEWCGROUP,
- .get = cgroupns_get,
- .put = cgroupns_put,
- .install = cgroupns_install,
- .owner = cgroupns_owner,
-};
-
-static __init int cgroup_namespaces_init(void)
-{
- return 0;
-}
-subsys_initcall(cgroup_namespaces_init);
-
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type, bool overridable)
@@ -6510,149 +5067,3 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- spin_lock_irq(&css_set_lock);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- spin_unlock_irq(&css_set_lock);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %p\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- spin_unlock_irq(&css_set_lock);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_is_populated(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cpuset.c b/kernel/cgroup/cpuset.c
index b3088886cd37..0f41292be0fb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -44,6 +44,8 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup/freezer.c
index 1b72d56edce5..1b72d56edce5 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup/freezer.c
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
new file mode 100644
index 000000000000..96d38dab6fb2
--- /dev/null
+++ b/kernel/cgroup/namespace.c
@@ -0,0 +1,155 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
+
+
+/* cgroup namespaces */
+
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+ int ret;
+
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (!new_ns)
+ return ERR_PTR(-ENOMEM);
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ return ERR_PTR(ret);
+ }
+ atomic_set(&new_ns->count, 1);
+ new_ns->ns.ops = &cgroupns_operations;
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ put_css_set(ns->root_cset);
+ dec_cgroup_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns;
+ struct ucounts *ucounts;
+ struct css_set *cset;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP)) {
+ get_cgroup_ns(old_ns);
+ return old_ns;
+ }
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ ucounts = inc_cgroup_namespaces(user_ns);
+ if (!ucounts)
+ return ERR_PTR(-ENOSPC);
+
+ /* It is not safe to take cgroup_mutex here */
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ new_ns = alloc_cgroup_ns();
+ if (IS_ERR(new_ns)) {
+ put_css_set(cset);
+ dec_cgroup_namespaces(ucounts);
+ return new_ns;
+ }
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->ucounts = ucounts;
+ new_ns->root_cset = cset;
+
+ return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Don't need to do anything if we are attaching to our own cgroupns. */
+ if (cgroup_ns == nsproxy->cgroup_ns)
+ return 0;
+
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+ put_cgroup_ns(to_cg_ns(ns));
+}
+
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+ return to_cg_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+ .owner = cgroupns_owner,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup/pids.c
index 2bd673783f1a..2237201d66d5 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup/pids.c
@@ -214,7 +214,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
- * on threadgroup_change_begin() held by the copy_process().
+ * on cgroup_threadgroup_change_begin() held by the copy_process().
*/
static int pids_can_fork(struct task_struct *task)
{
@@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task)
/* Only log the first time events_limit is incremented. */
if (atomic64_inc_return(&pids->events_limit) == 1) {
pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+ pr_cont_cgroup_path(css->cgroup);
pr_cont("\n");
}
cgroup_file_notify(&pids->events_file);
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
new file mode 100644
index 000000000000..defad3c5e7dc
--- /dev/null
+++ b/kernel/cgroup/rdma.c
@@ -0,0 +1,619 @@
+/*
+ * RDMA resource limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop processes from consuming
+ * additional RDMA resources after a certain limit is reached.
+ *
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/cgroup.h>
+#include <linux/parser.h>
+#include <linux/cgroup_rdma.h>
+
+#define RDMACG_MAX_STR "max"
+
+/*
+ * Protects list of resource pools maintained on per cgroup basis
+ * and rdma device list.
+ */
+static DEFINE_MUTEX(rdmacg_mutex);
+static LIST_HEAD(rdmacg_devices);
+
+enum rdmacg_file_type {
+ RDMACG_RESOURCE_TYPE_MAX,
+ RDMACG_RESOURCE_TYPE_STAT,
+};
+
+/*
+ * resource table definition as to be seen by the user.
+ * Need to add entries to it when more resources are
+ * added/defined at IB verb/core layer.
+ */
+static char const *rdmacg_resource_names[] = {
+ [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
+ [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
+};
+
+/* resource tracker for each resource of rdma cgroup */
+struct rdmacg_resource {
+ int max;
+ int usage;
+};
+
+/*
+ * resource pool object which represents per cgroup, per device
+ * resources. There are multiple instances of this object per cgroup,
+ * therefore it cannot be embedded within rdma_cgroup structure. It
+ * is maintained as list.
+ */
+struct rdmacg_resource_pool {
+ struct rdmacg_device *device;
+ struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
+
+ struct list_head cg_node;
+ struct list_head dev_node;
+
+ /* count active user tasks of this pool */
+ u64 usage_sum;
+ /* total number counts which are set to max */
+ int num_max_cnt;
+};
+
+static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct rdma_cgroup, css);
+}
+
+static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
+{
+ return css_rdmacg(cg->css.parent);
+}
+
+static inline struct rdma_cgroup *get_current_rdmacg(void)
+{
+ return css_rdmacg(task_get_css(current, rdma_cgrp_id));
+}
+
+static void set_resource_limit(struct rdmacg_resource_pool *rpool,
+ int index, int new_max)
+{
+ if (new_max == S32_MAX) {
+ if (rpool->resources[index].max != S32_MAX)
+ rpool->num_max_cnt++;
+ } else {
+ if (rpool->resources[index].max == S32_MAX)
+ rpool->num_max_cnt--;
+ }
+ rpool->resources[index].max = new_max;
+}
+
+static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
+{
+ int i;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
+ set_resource_limit(rpool, i, S32_MAX);
+}
+
+static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
+{
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_del(&rpool->cg_node);
+ list_del(&rpool->dev_node);
+ kfree(rpool);
+}
+
+static struct rdmacg_resource_pool *
+find_cg_rpool_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device)
+
+{
+ struct rdmacg_resource_pool *pool;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(pool, &cg->rpools, cg_node)
+ if (pool->device == device)
+ return pool;
+
+ return NULL;
+}
+
+static struct rdmacg_resource_pool *
+get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+ if (rpool)
+ return rpool;
+
+ rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
+ if (!rpool)
+ return ERR_PTR(-ENOMEM);
+
+ rpool->device = device;
+ set_all_resource_max_limit(rpool);
+
+ INIT_LIST_HEAD(&rpool->cg_node);
+ INIT_LIST_HEAD(&rpool->dev_node);
+ list_add_tail(&rpool->cg_node, &cg->rpools);
+ list_add_tail(&rpool->dev_node, &device->rpools);
+ return rpool;
+}
+
+/**
+ * uncharge_cg_locked - uncharge resource for rdma cgroup
+ * @cg: pointer to cg to uncharge and all parents in hierarchy
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cg (resource pool)
+ *
+ * It also frees the resource pool which was created as part of
+ * charging operation when there are no resources attached to
+ * resource pool.
+ */
+static void
+uncharge_cg_locked(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdmacg_resource_pool *rpool;
+
+ rpool = find_cg_rpool_locked(cg, device);
+
+ /*
+ * rpool cannot be null at this stage. Let kernel operate in case
+ * if there a bug in IB stack or rdma controller, instead of crashing
+ * the system.
+ */
+ if (unlikely(!rpool)) {
+ pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
+ return;
+ }
+
+ rpool->resources[index].usage--;
+
+ /*
+ * A negative count (or overflow) is invalid,
+ * it indicates a bug in the rdma controller.
+ */
+ WARN_ON_ONCE(rpool->resources[index].usage < 0);
+ rpool->usage_sum--;
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+}
+
+/**
+ * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
+ * stop uncharging
+ * @index: index of the resource to uncharge in cg in given resource pool
+ */
+static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ struct rdma_cgroup *stop_cg,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *p;
+
+ mutex_lock(&rdmacg_mutex);
+
+ for (p = cg; p != stop_cg; p = parent_rdmacg(p))
+ uncharge_cg_locked(p, device, index);
+
+ mutex_unlock(&rdmacg_mutex);
+
+ css_put(&cg->css);
+}
+
+/**
+ * rdmacg_uncharge - hierarchically uncharge rdma resource count
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to uncharge in cgroup in given resource pool
+ */
+void rdmacg_uncharge(struct rdma_cgroup *cg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ if (index >= RDMACG_RESOURCE_MAX)
+ return;
+
+ rdmacg_uncharge_hierarchy(cg, device, NULL, index);
+}
+EXPORT_SYMBOL(rdmacg_uncharge);
+
+/**
+ * rdmacg_try_charge - hierarchically try to charge the rdma resource
+ * @rdmacg: pointer to rdma cgroup which will own this resource
+ * @device: pointer to rdmacg device
+ * @index: index of the resource to charge in cgroup (resource pool)
+ *
+ * This function follows charging resource in hierarchical way.
+ * It will fail if the charge would cause the new value to exceed the
+ * hierarchical limit.
+ * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns pointer to rdmacg for this resource when charging is successful.
+ *
+ * Charger needs to account resources on two criteria.
+ * (a) per cgroup & (b) per device resource usage.
+ * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
+ * the configured limits. Per device provides granular configuration
+ * in multi device usage. It allocates resource pool in the hierarchy
+ * for each parent it come across for first resource. Later on resource
+ * pool will be available. Therefore it will be much faster thereon
+ * to charge/uncharge.
+ */
+int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
+ struct rdmacg_device *device,
+ enum rdmacg_resource_type index)
+{
+ struct rdma_cgroup *cg, *p;
+ struct rdmacg_resource_pool *rpool;
+ s64 new;
+ int ret = 0;
+
+ if (index >= RDMACG_RESOURCE_MAX)
+ return -EINVAL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_rdmacg();
+
+ mutex_lock(&rdmacg_mutex);
+ for (p = cg; p; p = parent_rdmacg(p)) {
+ rpool = get_cg_rpool_locked(p, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto err;
+ } else {
+ new = rpool->resources[index].usage + 1;
+ if (new > rpool->resources[index].max) {
+ ret = -EAGAIN;
+ goto err;
+ } else {
+ rpool->resources[index].usage = new;
+ rpool->usage_sum++;
+ }
+ }
+ }
+ mutex_unlock(&rdmacg_mutex);
+
+ *rdmacg = cg;
+ return 0;
+
+err:
+ mutex_unlock(&rdmacg_mutex);
+ rdmacg_uncharge_hierarchy(cg, device, p, index);
+ return ret;
+}
+EXPORT_SYMBOL(rdmacg_try_charge);
+
+/**
+ * rdmacg_register_device - register rdmacg device to rdma controller.
+ * @device: pointer to rdmacg device whose resources need to be accounted.
+ *
+ * If IB stack wish a device to participate in rdma cgroup resource
+ * tracking, it must invoke this API to register with rdma cgroup before
+ * any user space application can start using the RDMA resources.
+ * Returns 0 on success or EINVAL when table length given is beyond
+ * supported size.
+ */
+int rdmacg_register_device(struct rdmacg_device *device)
+{
+ INIT_LIST_HEAD(&device->dev_node);
+ INIT_LIST_HEAD(&device->rpools);
+
+ mutex_lock(&rdmacg_mutex);
+ list_add_tail(&device->dev_node, &rdmacg_devices);
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(rdmacg_register_device);
+
+/**
+ * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
+ * @device: pointer to rdmacg device which was previously registered with rdma
+ * controller using rdmacg_register_device().
+ *
+ * IB stack must invoke this after all the resources of the IB device
+ * are destroyed and after ensuring that no more resources will be created
+ * when this API is invoked.
+ */
+void rdmacg_unregister_device(struct rdmacg_device *device)
+{
+ struct rdmacg_resource_pool *rpool, *tmp;
+
+ /*
+ * Synchronize with any active resource settings,
+ * usage query happening via configfs.
+ */
+ mutex_lock(&rdmacg_mutex);
+ list_del_init(&device->dev_node);
+
+ /*
+ * Now that this device is off the cgroup list, its safe to free
+ * all the rpool resources.
+ */
+ list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
+ free_cg_rpool_locked(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+EXPORT_SYMBOL(rdmacg_unregister_device);
+
+static int parse_resource(char *c, int *intval)
+{
+ substring_t argstr;
+ const char **table = &rdmacg_resource_names[0];
+ char *name, *value = c;
+ size_t len;
+ int ret, i = 0;
+
+ name = strsep(&value, "=");
+ if (!name || !value)
+ return -EINVAL;
+
+ len = strlen(value);
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ if (strcmp(table[i], name))
+ continue;
+
+ argstr.from = value;
+ argstr.to = value + len;
+
+ ret = match_int(&argstr, intval);
+ if (ret >= 0) {
+ if (*intval < 0)
+ break;
+ return i;
+ }
+ if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
+ *intval = S32_MAX;
+ return i;
+ }
+ break;
+ }
+ return -EINVAL;
+}
+
+static int rdmacg_parse_limits(char *options,
+ int *new_limits, unsigned long *enables)
+{
+ char *c;
+ int err = -EINVAL;
+
+ /* parse resource options */
+ while ((c = strsep(&options, " ")) != NULL) {
+ int index, intval;
+
+ index = parse_resource(c, &intval);
+ if (index < 0)
+ goto err;
+
+ new_limits[index] = intval;
+ *enables |= BIT(index);
+ }
+ return 0;
+
+err:
+ return err;
+}
+
+static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
+{
+ struct rdmacg_device *device;
+
+ lockdep_assert_held(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node)
+ if (!strcmp(name, device->name))
+ return device;
+
+ return NULL;
+}
+
+static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdma_cgroup *cg = css_rdmacg(of_css(of));
+ const char *dev_name;
+ struct rdmacg_resource_pool *rpool;
+ struct rdmacg_device *device;
+ char *options = strstrip(buf);
+ int *new_limits;
+ unsigned long enables = 0;
+ int i = 0, ret = 0;
+
+ /* extract the device name first */
+ dev_name = strsep(&options, " ");
+ if (!dev_name) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
+ if (!new_limits) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = rdmacg_parse_limits(options, new_limits, &enables);
+ if (ret)
+ goto parse_err;
+
+ /* acquire lock to synchronize with hot plug devices */
+ mutex_lock(&rdmacg_mutex);
+
+ device = rdmacg_get_device_locked(dev_name);
+ if (!device) {
+ ret = -ENODEV;
+ goto dev_err;
+ }
+
+ rpool = get_cg_rpool_locked(cg, device);
+ if (IS_ERR(rpool)) {
+ ret = PTR_ERR(rpool);
+ goto dev_err;
+ }
+
+ /* now set the new limits of the rpool */
+ for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
+ set_resource_limit(rpool, i, new_limits[i]);
+
+ if (rpool->usage_sum == 0 &&
+ rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
+ /*
+ * No user of the rpool and all entries are set to max, so
+ * safe to delete this rpool.
+ */
+ free_cg_rpool_locked(rpool);
+ }
+
+dev_err:
+ mutex_unlock(&rdmacg_mutex);
+
+parse_err:
+ kfree(new_limits);
+
+err:
+ return ret ?: nbytes;
+}
+
+static void print_rpool_values(struct seq_file *sf,
+ struct rdmacg_resource_pool *rpool)
+{
+ enum rdmacg_file_type sf_type;
+ int i;
+ u32 value;
+
+ sf_type = seq_cft(sf)->private;
+
+ for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
+ seq_puts(sf, rdmacg_resource_names[i]);
+ seq_putc(sf, '=');
+ if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
+ if (rpool)
+ value = rpool->resources[i].max;
+ else
+ value = S32_MAX;
+ } else {
+ if (rpool)
+ value = rpool->resources[i].usage;
+ else
+ value = 0;
+ }
+
+ if (value == S32_MAX)
+ seq_puts(sf, RDMACG_MAX_STR);
+ else
+ seq_printf(sf, "%d", value);
+ seq_putc(sf, ' ');
+ }
+}
+
+static int rdmacg_resource_read(struct seq_file *sf, void *v)
+{
+ struct rdmacg_device *device;
+ struct rdmacg_resource_pool *rpool;
+ struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(device, &rdmacg_devices, dev_node) {
+ seq_printf(sf, "%s ", device->name);
+
+ rpool = find_cg_rpool_locked(cg, device);
+ print_rpool_values(sf, rpool);
+
+ seq_putc(sf, '\n');
+ }
+
+ mutex_unlock(&rdmacg_mutex);
+ return 0;
+}
+
+static struct cftype rdmacg_files[] = {
+ {
+ .name = "max",
+ .write = rdmacg_resource_set_max,
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_MAX,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = rdmacg_resource_read,
+ .private = RDMACG_RESOURCE_TYPE_STAT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cgroup_subsys_state *
+rdmacg_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct rdma_cgroup *cg;
+
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&cg->rpools);
+ return &cg->css;
+}
+
+static void rdmacg_css_free(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+
+ kfree(cg);
+}
+
+/**
+ * rdmacg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away and responsible
+ * for shooting down all rdmacg associated with @css. As part of that it
+ * marks all the resource pool entries to max value, so that when resources are
+ * uncharged, associated resource pool can be freed as well.
+ */
+static void rdmacg_css_offline(struct cgroup_subsys_state *css)
+{
+ struct rdma_cgroup *cg = css_rdmacg(css);
+ struct rdmacg_resource_pool *rpool;
+
+ mutex_lock(&rdmacg_mutex);
+
+ list_for_each_entry(rpool, &cg->rpools, cg_node)
+ set_all_resource_max_limit(rpool);
+
+ mutex_unlock(&rdmacg_mutex);
+}
+
+struct cgroup_subsys rdma_cgrp_subsys = {
+ .css_alloc = rdmacg_css_alloc,
+ .css_free = rdmacg_css_free,
+ .css_offline = rdmacg_css_offline,
+ .legacy_cftypes = rdmacg_files,
+ .dfl_cftypes = rdmacg_files,
+};
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 1a8f34f63601..26a06e09a5bd 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -21,6 +21,7 @@ CONFIG_CP15_BARRIER_EMULATION=y
CONFIG_DEFAULT_SECURITY_SELINUX=y
CONFIG_EMBEDDED=y
CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
CONFIG_HIGH_RES_TIMERS=y
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
@@ -129,6 +130,7 @@ CONFIG_PPP_DEFLATE=y
CONFIG_PPP_MPPE=y
CONFIG_PREEMPT=y
CONFIG_QUOTA=y
+CONFIG_RANDOMIZE_BASE=y
CONFIG_RTC_CLASS=y
CONFIG_RT_GROUP_SCHED=y
CONFIG_SECCOMP=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 297756be369c..28ee064b6744 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,4 +1,5 @@
# KEEP ALPHABETICALLY SORTED
+# CONFIG_AIO is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
@@ -11,7 +12,7 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_COMPACTION=y
-CONFIG_DEBUG_RODATA=y
+CONFIG_STRICT_KERNEL_RWX=y
CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0a5f630f5c54..37b223e4fc05 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -7,7 +7,9 @@
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
@@ -1333,26 +1335,21 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
struct cpuhp_step *sp;
int ret = 0;
- mutex_lock(&cpuhp_state_mutex);
-
if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
- goto out;
+ return ret;
state = ret;
}
sp = cpuhp_get_step(state);
- if (name && sp->name) {
- ret = -EBUSY;
- goto out;
- }
+ if (name && sp->name)
+ return -EBUSY;
+
sp->startup.single = startup;
sp->teardown.single = teardown;
sp->name = name;
sp->multi_instance = multi_instance;
INIT_HLIST_HEAD(&sp->list);
-out:
- mutex_unlock(&cpuhp_state_mutex);
return ret;
}
@@ -1426,6 +1423,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
return -EINVAL;
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
if (!invoke || !sp->startup.multi)
goto add_node;
@@ -1445,16 +1443,14 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
if (ret) {
if (sp->teardown.multi)
cpuhp_rollback_install(cpu, state, node);
- goto err;
+ goto unlock;
}
}
add_node:
ret = 0;
- mutex_lock(&cpuhp_state_mutex);
hlist_add_head(node, &sp->list);
+unlock:
mutex_unlock(&cpuhp_state_mutex);
-
-err:
put_online_cpus();
return ret;
}
@@ -1489,6 +1485,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
return -EINVAL;
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
ret = cpuhp_store_callbacks(state, name, startup, teardown,
multi_instance);
@@ -1522,6 +1519,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
}
}
out:
+ mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
/*
* If the requested state is CPUHP_AP_ONLINE_DYN, return the
@@ -1545,6 +1543,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
return -EINVAL;
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
+
if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
@@ -1561,7 +1561,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
}
remove:
- mutex_lock(&cpuhp_state_mutex);
hlist_del(node);
mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
@@ -1569,6 +1568,7 @@ remove:
return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
+
/**
* __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
* @state: The state to remove
@@ -1587,6 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
if (sp->multi_instance) {
WARN(!hlist_empty(&sp->list),
"Error: Removing state %d which has instances left.\n",
@@ -1611,6 +1612,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
remove:
cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
+ mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
}
EXPORT_SYMBOL(__cpuhp_remove_state);
diff --git a/kernel/cred.c b/kernel/cred.c
index 5f264fb5737d..2bc66075740f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -12,6 +12,7 @@
#include <linux/cred.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/coredump.h>
#include <linux/key.h>
#include <linux/keyctl.h>
#include <linux/init_task.h>
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 79517e5549f1..65c0f1363788 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
+#include <linux/nmi.h>
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -232,9 +233,9 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
int i;
for (i = 0; i < VMACACHE_SIZE; i++) {
- if (!current->vmacache[i])
+ if (!current->vmacache.vmas[i])
continue;
- flush_cache_range(current->vmacache[i],
+ flush_cache_range(current->vmacache.vmas[i],
addr, addr + BREAK_INSTR_SIZE);
}
}
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 19d9a578c753..7510dc687c0d 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -29,6 +29,7 @@
*/
#include <linux/kernel.h>
+#include <linux/sched/signal.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
#include <linux/serial_core.h>
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index fe15fff5df53..6ad4a9fcbd6f 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -12,7 +12,8 @@
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
#include "kdb_private.h"
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ca183919d302..c8146d53ca67 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -18,6 +18,9 @@
#include <linux/kmsg_dump.h>
#include <linux/reboot.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/debug.h>
#include <linux/sysrq.h>
#include <linux/smp.h>
#include <linux/utsname.h>
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 660549656991..4a1c33416b6a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -14,6 +14,8 @@
*/
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
#include <linux/slab.h>
#include <linux/taskstats.h>
#include <linux/time.h>
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index e9fdb5203de5..c04917cad1bf 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -11,6 +11,8 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
+#include <linux/sched/task_stack.h>
+
#include "internal.h"
struct callchain_cpus_entries {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 77a932b54a64..ff01cba86f43 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -46,6 +46,8 @@
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
#include "internal.h"
@@ -455,7 +457,7 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
@@ -996,7 +998,7 @@ list_update_cgroup_event(struct perf_event *event,
*/
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
- * function must be called with interrupts disbled
+ * function must be called with interrupts disabled
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
@@ -3522,6 +3524,8 @@ static void perf_event_enable_on_exec(int ctxn)
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
+ } else {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4252,7 +4256,7 @@ int perf_event_release_kernel(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
/*
- * Mark this even as STATE_DEAD, there is no external reference to it
+ * Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
*
* Anybody acquiring event->child_mutex after the below loop _must_
@@ -4925,9 +4929,9 @@ unlock:
rcu_read_unlock();
}
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int perf_mmap_fault(struct vm_fault *vmf)
{
- struct perf_event *event = vma->vm_file->private_data;
+ struct perf_event *event = vmf->vma->vm_file->private_data;
struct ring_buffer *rb;
int ret = VM_FAULT_SIGBUS;
@@ -4950,7 +4954,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto unlock;
get_page(vmf->page);
- vmf->page->mapping = vma->vm_file->f_mapping;
+ vmf->page->mapping = vmf->vma->vm_file->f_mapping;
vmf->page->index = vmf->pgoff;
ret = 0;
@@ -9955,6 +9959,7 @@ SYSCALL_DEFINE5(perf_event_open,
* of swizzling perf_event::ctx.
*/
perf_remove_from_context(group_leader, 0);
+ put_ctx(gctx);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
@@ -9993,13 +9998,6 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
-
- /*
- * Now that all events are installed in @ctx, nothing
- * references @gctx anymore, so drop the last reference we have
- * on it.
- */
- put_ctx(gctx);
}
/*
@@ -10419,21 +10417,22 @@ void perf_event_free_task(struct task_struct *task)
continue;
mutex_lock(&ctx->mutex);
-again:
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
- group_entry)
- perf_free_event(event, ctx);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
- group_entry)
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
perf_free_event(event, ctx);
- if (!list_empty(&ctx->pinned_groups) ||
- !list_empty(&ctx->flexible_groups))
- goto again;
-
mutex_unlock(&ctx->mutex);
-
put_ctx(ctx);
}
}
@@ -10471,7 +10470,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
}
/*
- * inherit a event from parent task to child task:
+ * Inherit a event from parent task to child task.
+ *
+ * Returns:
+ * - valid pointer on success
+ * - NULL for orphaned events
+ * - IS_ERR() on error
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
@@ -10565,6 +10569,16 @@ inherit_event(struct perf_event *parent_event,
return child_event;
}
+/*
+ * Inherits an event group.
+ *
+ * This will quietly suppress orphaned events; !inherit_event() is not an error.
+ * This matches with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10579,6 +10593,11 @@ static int inherit_group(struct perf_event *parent_event,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
+ /*
+ * @leader can be NULL here because of is_orphaned_event(). In this
+ * case inherit_event() will create individual events, similar to what
+ * perf_group_detach() would do anyway.
+ */
list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
@@ -10588,6 +10607,17 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}
+/*
+ * Creates the child task context and tries to inherit the event-group.
+ *
+ * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
+ * inherited_all set when we 'fail' to inherit an orphaned event; this is
+ * consistent with perf_event_release_kernel() removing all child events.
+ *
+ * Returns:
+ * - 0 on success
+ * - <0 on error
+ */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10610,7 +10640,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
-
child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -10672,7 +10701,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
/*
@@ -10688,7 +10717,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -10716,6 +10745,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
@@ -10959,5 +10989,11 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.attach = perf_cgroup_attach,
+ /*
+ * Implicitly enable on dfl hierarchy so that perf events can
+ * always be filtered by cgroup2 path as long as perf_event
+ * controller is not mounted on a legacy hierarchy.
+ */
+ .implicit_on_dfl = true,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d416f3baf392..0e137f98a50c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,8 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
@@ -153,14 +155,19 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pte_t *ptep;
+ struct page_vma_mapped_walk pvmw = {
+ .page = old_page,
+ .vma = vma,
+ .address = addr,
+ };
int err;
/* For mmu_notifiers */
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
struct mem_cgroup *memcg;
+ VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
+
err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg,
false);
if (err)
@@ -171,11 +178,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
- ptep = page_check_address(old_page, mm, addr, &ptl, 0);
- if (!ptep) {
+ if (!page_vma_mapped_walk(&pvmw)) {
mem_cgroup_cancel_charge(new_page, memcg, false);
goto unlock;
}
+ VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
get_page(new_page);
page_add_new_anon_rmap(new_page, vma, addr, false);
@@ -187,14 +194,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
inc_mm_counter(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot));
+ flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
+ ptep_clear_flush_notify(vma, addr, pvmw.pte);
+ set_pte_at_notify(mm, addr, pvmw.pte,
+ mk_pte(new_page, vma->vm_page_prot));
page_remove_rmap(old_page, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
- pte_unmap_unlock(ptep, ptl);
+ page_vma_mapped_walk_done(&pvmw);
if (vma->vm_flags & VM_LOCKED)
munlock_vma_page(old_page);
@@ -300,8 +308,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
- &vma, NULL);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1,
+ FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL);
if (ret <= 0)
return ret;
@@ -741,7 +749,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
continue;
}
- if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
+ if (!mmget_not_zero(vma->vm_mm))
continue;
info = prev;
diff --git a/kernel/exit.c b/kernel/exit.c
index b67c57faa705..516acdb0e0ec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -6,6 +6,12 @@
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
@@ -14,7 +20,6 @@
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
-#include <linux/security.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
@@ -46,6 +51,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
@@ -539,7 +545,7 @@ static void exit_mm(void)
__set_current_state(TASK_RUNNING);
down_read(&mm->mmap_sem);
}
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
@@ -608,15 +614,18 @@ static struct task_struct *find_new_reaper(struct task_struct *father,
return thread;
if (father->signal->has_child_subreaper) {
+ unsigned int ns_level = task_pid(father)->level;
/*
* Find the first ->is_child_subreaper ancestor in our pid_ns.
- * We start from father to ensure we can not look into another
- * namespace, this is safe because all its threads are dead.
+ * We can't check reaper != child_reaper to ensure we do not
+ * cross the namespaces, the exiting parent could be injected
+ * by setns() + fork().
+ * We check pid->level, this is slightly more efficient than
+ * task_active_pid_ns(reaper) != task_active_pid_ns(father).
*/
- for (reaper = father;
- !same_thread_group(reaper, child_reaper);
+ for (reaper = father->real_parent;
+ task_pid(reaper)->level == ns_level;
reaper = reaper->real_parent) {
- /* call_usermodehelper() descendants need this check */
if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
@@ -1390,7 +1399,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
@@ -1410,20 +1419,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
if (!ret)
return ret;
- ret = security_task_wait(p);
- if (unlikely(ret < 0)) {
- /*
- * If we have not yet seen any eligible child,
- * then let this error code replace -ECHILD.
- * A permission error will give the user a clue
- * to look for security policy problems, rather
- * than for mysterious wait bugs.
- */
- if (wo->notask_error)
- wo->notask_error = ret;
- return 0;
- }
-
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
@@ -1516,7 +1511,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
- * or another error from security_task_wait(), or still -ECHILD.
+ * or still -ECHILD.
*/
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
diff --git a/kernel/extable.c b/kernel/extable.c
index e1359474baa5..2676d7f8baf6 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,10 +17,12 @@
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
+#include <linux/extable.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/kprobes.h>
+#include <linux/filter.h>
#include <asm/sections.h>
#include <linux/uaccess.h>
@@ -107,6 +109,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
return 1;
+ if (is_bpf_text_address(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -130,6 +134,8 @@ int kernel_text_address(unsigned long addr)
return 1;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
return 1;
+ if (is_bpf_text_address(addr))
+ return 1;
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index ff82e24573b6..6c463c80e93d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -12,6 +12,16 @@
*/
#include <linux/slab.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/user.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
+#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
@@ -55,6 +65,7 @@
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
@@ -561,6 +572,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
+ LIST_HEAD(uf);
uprobe_start_dup_mmap();
if (down_write_killable(&oldmm->mmap_sem)) {
@@ -617,12 +629,13 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto fail_nomem_policy;
tmp->vm_mm = mm;
+ retval = dup_userfaultfd(tmp, &uf);
+ if (retval)
+ goto fail_nomem_anon_vma_fork;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &=
- ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
tmp->vm_next = tmp->vm_prev = NULL;
- tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -678,6 +691,7 @@ out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
up_write(&oldmm->mmap_sem);
+ dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
return retval;
@@ -996,7 +1010,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
if (task->flags & PF_KTHREAD)
mm = NULL;
else
- atomic_inc(&mm->mm_users);
+ mmget(mm);
}
task_unlock(task);
return mm;
@@ -1184,7 +1198,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
vmacache_flush(tsk);
if (clone_flags & CLONE_VM) {
- atomic_inc(&oldmm->mm_users);
+ mmget(oldmm);
mm = oldmm;
goto good_mm;
}
@@ -1373,9 +1387,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
- sig->has_child_subreaper = current->signal->has_child_subreaper ||
- current->signal->is_child_subreaper;
-
mutex_init(&sig->cred_guard_mutex);
return 0;
@@ -1454,6 +1465,21 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
task->pids[type].pid = pid;
}
+static inline void rcu_copy_process(struct task_struct *p)
+{
+#ifdef CONFIG_PREEMPT_RCU
+ p->rcu_read_lock_nesting = 0;
+ p->rcu_read_unlock_special.s = 0;
+ p->rcu_blocked_node = NULL;
+ INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+ p->rcu_tasks_holdout = false;
+ INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+ p->rcu_tasks_idle_cpu = -1;
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1745,7 +1771,7 @@ static __latent_entropy struct task_struct *copy_process(
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
- threadgroup_change_begin(current);
+ cgroup_threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
@@ -1810,6 +1836,13 @@ static __latent_entropy struct task_struct *copy_process(
p->signal->leader_pid = pid;
p->signal->tty = tty_kref_get(current->signal->tty);
+ /*
+ * Inherit has_child_subreaper flag under the same
+ * tasklist_lock with adding child to the process tree
+ * for propagate_has_child_subreaper optimization.
+ */
+ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
+ p->real_parent->signal->is_child_subreaper;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_PGID);
@@ -1835,7 +1868,7 @@ static __latent_entropy struct task_struct *copy_process(
proc_fork_connector(p);
cgroup_post_fork(p);
- threadgroup_change_end(current);
+ cgroup_threadgroup_change_end(current);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -1846,7 +1879,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p);
bad_fork_free_pid:
- threadgroup_change_end(current);
+ cgroup_threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
@@ -2063,6 +2096,38 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
+void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
+{
+ struct task_struct *leader, *parent, *child;
+ int res;
+
+ read_lock(&tasklist_lock);
+ leader = top = top->group_leader;
+down:
+ for_each_thread(leader, parent) {
+ list_for_each_entry(child, &parent->children, sibling) {
+ res = visitor(child, data);
+ if (res) {
+ if (res < 0)
+ goto out;
+ leader = child;
+ goto down;
+ }
+up:
+ ;
+ }
+ }
+
+ if (leader != top) {
+ child = leader;
+ parent = child->real_parent;
+ leader = parent->group_leader;
+ goto up;
+ }
+out:
+ read_unlock(&tasklist_lock);
+}
+
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif
diff --git a/kernel/futex.c b/kernel/futex.c
index cdf365036141..45858ec73941 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,8 @@
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
@@ -338,7 +340,7 @@ static inline bool should_fail_futex(bool fshared)
static inline void futex_get_mm(union futex_key *key)
{
- atomic_inc(&key->private.mm->mm_count);
+ mmgrab(key->private.mm);
/*
* Ensure futex_get_mm() implies a full barrier such that
* get_futex_key() implies a full barrier. This is relied upon
@@ -2813,7 +2815,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
- struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
@@ -2897,6 +2898,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+ rt_mutex_unlock(&q.pi_state->pi_mutex);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2905,6 +2908,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
spin_unlock(q.lock_ptr);
}
} else {
+ struct rt_mutex *pi_mutex;
+
/*
* We have been woken up by futex_unlock_pi(), a timeout, or a
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
@@ -2928,18 +2933,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
+ /*
+ * If fixup_pi_state_owner() faulted and was unable to handle
+ * the fault, unlock the rt_mutex and return the fault to
+ * userspace.
+ */
+ if (ret && rt_mutex_owner(pi_mutex) == current)
+ rt_mutex_unlock(pi_mutex);
+
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
- /*
- * If fixup_pi_state_owner() faulted and was unable to handle the
- * fault, unlock the rt_mutex and return the fault to userspace.
- */
- if (ret == -EFAULT) {
- if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
- } else if (ret == -EINTR) {
+ if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
* futex_lock_pi() directly. We could restart this syscall, but
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 40c07e4fa116..f0f8e2a8496f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -16,6 +16,9 @@
#include <linux/export.h>
#include <linux/sysctl.h>
#include <linux/utsname.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+
#include <trace/events/sched.h>
/*
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4544b115f5eb..d052947fe785 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
struct cpumask *
irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
{
- int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec;
+ int n, nodes, cpus_per_vec, extra_vecs, curvec;
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
@@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
goto done;
}
- /* Spread the vectors per node */
- vecs_per_node = affv / nodes;
- /* Account for rounding errors */
- extra_vecs = affv - (nodes * vecs_per_node);
-
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign = vecs_per_node;
+ int ncpus, v, vecs_to_assign, vecs_per_node;
+
+ /* Spread the vectors per node */
+ vecs_per_node = (affv - curvec) / nodes;
/* Get the cpus on this node which are in the mask */
cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
+ vecs_to_assign = min(vecs_per_node, ncpus);
+
+ /* Account for rounding errors */
+ extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
for (v = 0; curvec < last_affv && v < vecs_to_assign;
curvec++, v++) {
@@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
- if (!--extra_vecs)
- vecs_per_node++;
+ --extra_vecs;
}
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
if (curvec >= last_affv)
break;
+ --nodes;
}
done:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6b669593e7eb..a4afe5cc5af1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,8 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>
#include "internals.h"
@@ -353,7 +355,7 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
return 0;
/*
- * Preserve the managed affinity setting and an userspace affinity
+ * Preserve the managed affinity setting and a userspace affinity
* setup, but make sure that one of the targets is online.
*/
if (irqd_affinity_is_managed(&desc->irq_data) ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a9b8cf500591..6c9cb208ac48 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -236,12 +236,28 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
- return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+ WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);
+ return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK);
}
static inline bool static_key_type(struct static_key *key)
{
- return (unsigned long)key->entries & JUMP_TYPE_MASK;
+ return key->type & JUMP_TYPE_TRUE;
+}
+
+static inline bool static_key_linked(struct static_key *key)
+{
+ return key->type & JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_clear_linked(struct static_key *key)
+{
+ key->type &= ~JUMP_TYPE_LINKED;
+}
+
+static inline void static_key_set_linked(struct static_key *key)
+{
+ key->type |= JUMP_TYPE_LINKED;
}
static inline struct static_key *jump_entry_key(struct jump_entry *entry)
@@ -254,6 +270,26 @@ static bool jump_entry_branch(struct jump_entry *entry)
return (unsigned long)entry->key & 1UL;
}
+/***
+ * A 'struct static_key' uses a union such that it either points directly
+ * to a table of 'struct jump_entry' or to a linked list of modules which in
+ * turn point to 'struct jump_entry' tables.
+ *
+ * The two lower bits of the pointer are used to keep track of which pointer
+ * type is in use and to store the initial branch direction, we use an access
+ * function which preserves these bits.
+ */
+static void static_key_set_entries(struct static_key *key,
+ struct jump_entry *entries)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->entries = entries;
+ key->type |= type;
+}
+
static enum jump_label_type jump_label_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
@@ -313,13 +349,7 @@ void __init jump_label_init(void)
continue;
key = iterk;
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
-#ifdef CONFIG_MODULES
- key->next = NULL;
-#endif
+ static_key_set_entries(key, iter);
}
static_key_initialized = true;
jump_label_unlock();
@@ -343,6 +373,29 @@ struct static_key_mod {
struct module *mod;
};
+static inline struct static_key_mod *static_key_mod(struct static_key *key)
+{
+ WARN_ON_ONCE(!(key->type & JUMP_TYPE_LINKED));
+ return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK);
+}
+
+/***
+ * key->type and key->next are the same via union.
+ * This sets key->next and preserves the type bits.
+ *
+ * See additional comments above static_key_set_entries().
+ */
+static void static_key_set_mod(struct static_key *key,
+ struct static_key_mod *mod)
+{
+ unsigned long type;
+
+ WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
+ type = key->type & JUMP_TYPE_MASK;
+ key->next = mod;
+ key->type |= type;
+}
+
static int __jump_label_mod_text_reserved(void *start, void *end)
{
struct module *mod;
@@ -365,11 +418,23 @@ static void __jump_label_mod_update(struct static_key *key)
{
struct static_key_mod *mod;
- for (mod = key->next; mod; mod = mod->next) {
- struct module *m = mod->mod;
+ for (mod = static_key_mod(key); mod; mod = mod->next) {
+ struct jump_entry *stop;
+ struct module *m;
+
+ /*
+ * NULL if the static_key is defined in a module
+ * that does not use it
+ */
+ if (!mod->entries)
+ continue;
- __jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries);
+ m = mod->mod;
+ if (!m)
+ stop = __stop___jump_table;
+ else
+ stop = m->jump_entries + m->num_jump_entries;
+ __jump_label_update(key, mod->entries, stop);
}
}
@@ -404,7 +469,7 @@ static int jump_label_add_module(struct module *mod)
struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
struct static_key *key = NULL;
- struct static_key_mod *jlm;
+ struct static_key_mod *jlm, *jlm2;
/* if the module doesn't have jump label entries, just return */
if (iter_start == iter_stop)
@@ -421,20 +486,32 @@ static int jump_label_add_module(struct module *mod)
key = iterk;
if (within_module(iter->key, mod)) {
- /*
- * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
- */
- *((unsigned long *)&key->entries) += (unsigned long)iter;
- key->next = NULL;
+ static_key_set_entries(key, iter);
continue;
}
jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
if (!jlm)
return -ENOMEM;
+ if (!static_key_linked(key)) {
+ jlm2 = kzalloc(sizeof(struct static_key_mod),
+ GFP_KERNEL);
+ if (!jlm2) {
+ kfree(jlm);
+ return -ENOMEM;
+ }
+ preempt_disable();
+ jlm2->mod = __module_address((unsigned long)key);
+ preempt_enable();
+ jlm2->entries = static_key_entries(key);
+ jlm2->next = NULL;
+ static_key_set_mod(key, jlm2);
+ static_key_set_linked(key);
+ }
jlm->mod = mod;
jlm->entries = iter;
- jlm->next = key->next;
- key->next = jlm;
+ jlm->next = static_key_mod(key);
+ static_key_set_mod(key, jlm);
+ static_key_set_linked(key);
/* Only update if we've changed from our initial state */
if (jump_label_type(iter) != jump_label_init_type(iter))
@@ -461,16 +538,34 @@ static void jump_label_del_module(struct module *mod)
if (within_module(iter->key, mod))
continue;
+ /* No memory during module load */
+ if (WARN_ON(!static_key_linked(key)))
+ continue;
+
prev = &key->next;
- jlm = key->next;
+ jlm = static_key_mod(key);
while (jlm && jlm->mod != mod) {
prev = &jlm->next;
jlm = jlm->next;
}
- if (jlm) {
+ /* No memory during module load */
+ if (WARN_ON(!jlm))
+ continue;
+
+ if (prev == &key->next)
+ static_key_set_mod(key, jlm->next);
+ else
*prev = jlm->next;
+
+ kfree(jlm);
+
+ jlm = static_key_mod(key);
+ /* if only one etry is left, fold it back into the static_key */
+ if (jlm->next == NULL) {
+ static_key_set_entries(key, jlm->entries);
+ static_key_clear_linked(key);
kfree(jlm);
}
}
@@ -499,8 +594,10 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
case MODULE_STATE_COMING:
jump_label_lock();
ret = jump_label_add_module(mod);
- if (ret)
+ if (ret) {
+ WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
jump_label_del_module(mod);
+ }
jump_label_unlock();
break;
case MODULE_STATE_GOING:
@@ -561,11 +658,14 @@ int jump_label_text_reserved(void *start, void *end)
static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = static_key_entries(key);
+ struct jump_entry *entry;
#ifdef CONFIG_MODULES
struct module *mod;
- __jump_label_mod_update(key);
+ if (static_key_linked(key)) {
+ __jump_label_mod_update(key);
+ return;
+ }
preempt_disable();
mod = __module_address((unsigned long)key);
@@ -573,6 +673,7 @@ static void jump_label_update(struct static_key *key)
stop = mod->jump_entries + mod->num_jump_entries;
preempt_enable();
#endif
+ entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
if (entry)
__jump_label_update(key, entry, stop);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fafd1a3ef0da..6a3b249a2ae1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -23,6 +23,7 @@
#include <linux/mm.h>
#include <linux/ctype.h>
#include <linux/slab.h>
+#include <linux/filter.h>
#include <linux/compiler.h>
#include <asm/sections.h>
@@ -300,10 +301,11 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
unsigned long *offset)
{
char namebuf[KSYM_NAME_LEN];
+
if (is_ksym_addr(addr))
return !!get_symbol_pos(addr, symbolsize, offset);
-
- return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
+ return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
+ !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
/*
@@ -318,6 +320,8 @@ const char *kallsyms_lookup(unsigned long addr,
unsigned long *offset,
char **modname, char *namebuf)
{
+ const char *ret;
+
namebuf[KSYM_NAME_LEN - 1] = 0;
namebuf[0] = 0;
@@ -333,9 +337,13 @@ const char *kallsyms_lookup(unsigned long addr,
return namebuf;
}
- /* See if it's in a module. */
- return module_address_lookup(addr, symbolsize, offset, modname,
- namebuf);
+ /* See if it's in a module or a BPF JITed image. */
+ ret = module_address_lookup(addr, symbolsize, offset,
+ modname, namebuf);
+ if (!ret)
+ ret = bpf_address_lookup(addr, symbolsize,
+ offset, modname, namebuf);
+ return ret;
}
int lookup_symbol_name(unsigned long addr, char *symname)
@@ -471,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
struct kallsym_iter {
loff_t pos;
+ loff_t pos_mod_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -481,13 +490,27 @@ struct kallsym_iter {
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
- &iter->type, iter->name, iter->module_name,
- &iter->exported) < 0)
+ int ret = module_get_kallsym(iter->pos - kallsyms_num_syms,
+ &iter->value, &iter->type,
+ iter->name, iter->module_name,
+ &iter->exported);
+ if (ret < 0) {
+ iter->pos_mod_end = iter->pos;
return 0;
+ }
+
return 1;
}
+static int get_ksymbol_bpf(struct kallsym_iter *iter)
+{
+ iter->module_name[0] = '\0';
+ iter->exported = 0;
+ return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
+}
+
/* Returns space to next name. */
static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
@@ -508,16 +531,30 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->name[0] = '\0';
iter->nameoff = get_symbol_offset(new_pos);
iter->pos = new_pos;
+ if (new_pos == 0)
+ iter->pos_mod_end = 0;
+}
+
+static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
+{
+ iter->pos = pos;
+
+ if (iter->pos_mod_end > 0 &&
+ iter->pos_mod_end < iter->pos)
+ return get_ksymbol_bpf(iter);
+
+ if (!get_ksymbol_mod(iter))
+ return get_ksymbol_bpf(iter);
+
+ return 1;
}
/* Returns false if pos at or past end of file. */
static int update_iter(struct kallsym_iter *iter, loff_t pos)
{
/* Module symbols can be accessed randomly. */
- if (pos >= kallsyms_num_syms) {
- iter->pos = pos;
- return get_ksymbol_mod(iter);
- }
+ if (pos >= kallsyms_num_syms)
+ return update_iter_mod(iter, pos);
/* If we're not on the desired position, reset to new position. */
if (pos != iter->pos)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 5617cc412444..bfe62d5b3872 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -916,7 +916,7 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_nmi_flush_on_panic();
+ printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
@@ -1399,7 +1399,7 @@ void __weak arch_crash_save_vmcoreinfo(void)
phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
+ return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
}
static int __init crash_save_vmcoreinfo_init(void)
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b56a558e406d..b118735fea9d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -614,13 +614,13 @@ static int kexec_calculate_store_digests(struct kimage *image)
ret = crypto_shash_final(desc, digest);
if (ret)
goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
- sha_regions, sha_region_sz, 0);
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
+ sha_regions, sha_region_sz, 0);
if (ret)
goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
+ ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
if (ret)
goto out_free_digest;
}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 4cef7e4706b0..799a8a452187 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -15,11 +15,7 @@ int kimage_is_destination_range(struct kimage *image,
extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
-struct kexec_sha_region {
- unsigned long start;
- unsigned long len;
-};
-
+#include <linux/purgatory.h>
void kimage_file_post_load_cleanup(struct kimage *image);
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index d45c96073afb..563f97e2be36 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,6 +20,8 @@
*/
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
@@ -516,7 +518,7 @@ static void helper_unlock(void)
* Function must be runnable in either a process context or the
* context in which call_usermodehelper_exec is called.
*/
-struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
+struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
char **envp, gfp_t gfp_mask,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info),
@@ -528,7 +530,12 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
goto out;
INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
+
+#ifdef CONFIG_STATIC_USERMODEHELPER
+ sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH;
+#else
sub_info->path = path;
+#endif
sub_info->argv = argv;
sub_info->envp = envp;
@@ -566,6 +573,15 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
retval = -EBUSY;
goto out;
}
+
+ /*
+ * If there is no binary for us to call, then just return and get out of
+ * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and
+ * disable all call_usermodehelper() calls.
+ */
+ if (strlen(sub_info->path) == 0)
+ goto out;
+
/*
* Set the completion pointer only if there is a waiter.
* This makes it possible to use umh_complete to free
@@ -613,7 +629,7 @@ EXPORT_SYMBOL(call_usermodehelper_exec);
* This function is the equivalent to use call_usermodehelper_setup() and
* call_usermodehelper_exec().
*/
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper(const char *path, char **argv, char **envp, int wait)
{
struct subprocess_info *info;
gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ebb4dadca66b..699c5bc51a92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1740,6 +1740,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
+int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return NOTIFY_DONE;
+}
+
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
.priority = 0x7fffffff /* we need to be notified first */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index ee1bc1bb8feb..0999679d6f26 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -195,7 +195,7 @@ static ssize_t notes_read(struct file *filp, struct kobject *kobj,
return count;
}
-static struct bin_attribute notes_attr = {
+static struct bin_attribute notes_attr __ro_after_init = {
.attr = {
.name = "notes",
.mode = S_IRUGO,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8461a4372e8a..26db528c1d88 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -5,7 +5,9 @@
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
+#include <uapi/linux/sched/types.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
@@ -18,6 +20,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/cgroup.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -223,6 +226,7 @@ static int kthread(void *_create)
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+ cgroup_kthread_ready();
__kthread_parkme(self);
ret = threadfn(data);
}
@@ -536,6 +540,7 @@ int kthreadd(void *unused)
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
+ cgroup_init_kthreadd();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index b5c30d9f46c5..96b4179cee6a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -55,6 +55,8 @@
#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/stat.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9812e5dd409e..a95e5d1f4a9c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -28,6 +28,8 @@
#define DISABLE_BRANCH_PROFILING
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
@@ -3260,10 +3262,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references)
+ if (hlock->references) {
+ /*
+ * Check: unsigned int references:12, overflow.
+ */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
+ return 0;
+
hlock->references++;
- else
+ } else {
hlock->references = 2;
+ }
return 1;
}
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index c2b88490d857..c08fbd2f5ba9 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -46,13 +46,13 @@ enum {
(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
/*
- * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text,
+ * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
* .data and .bss to fit in required 32MB limit for the kernel. With
- * PROVE_LOCKING we could go over this limit and cause system boot-up problems.
+ * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems.
* So, reduce the static allocations for lockdeps related structures so that
* everything fits in current required size limit.
*/
-#ifdef CONFIG_PROVE_LOCKING_SMALL
+#ifdef CONFIG_LOCKDEP_SMALL
/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 28350dc8ecbb..f24582d4dad3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -32,6 +32,8 @@
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/rtmutex.h>
#include <linux/atomic.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ad2d9e22697b..198527a62149 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -19,8 +19,10 @@
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index e852be4851fc..4a30ef63c607 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -63,6 +63,7 @@ enum qlock_stats {
*/
#include <linux/debugfs.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/fs.h>
static const char * const qstat_names[qstat_num + 1] = {
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 62b6cee8ea7f..97ee9df32e0f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -18,6 +18,7 @@
*/
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/sched/debug.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/spinlock.h>
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index d340be3a488f..6edc32ecd9c5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -12,9 +12,11 @@
*/
#include <linux/spinlock.h>
#include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/timer.h>
#include "rtmutex_common.h"
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 990134617b4c..856dfff5c33a 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -13,6 +13,7 @@
#define __KERNEL_RTMUTEX_COMMON_H
#include <linux/rtmutex.h>
+#include <linux/sched/wake_q.h>
/*
* This is the control structure for tasks blocked on a rt_mutex,
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 5eacab880f67..c65f7989f850 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -6,7 +6,8 @@
* - Derived also from comments by Linus
*/
#include <linux/rwsem.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
enum rwsem_waiter_type {
@@ -212,10 +213,9 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
*/
if (sem->count == 0)
break;
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
set_current_state(state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
@@ -223,12 +223,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
}
/* got the lock */
sem->count = -1;
-out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return ret;
+
+out_nolock:
+ list_del(&waiter.list);
+ if (!list_empty(&sem->wait_list))
+ __rwsem_do_wake(sem, 1);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return -EINTR;
}
void __sched __down_write(struct rw_semaphore *sem)
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2ad8d8dc3bb1..34e727f18e49 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -10,10 +10,12 @@
* and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
*/
#include <linux/rwsem.h>
-#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/debug.h>
#include <linux/osq_lock.h>
#include "rwsem.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 45ba475d4be3..90a74ccd85a4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9512e37637dc..561acdd39960 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index da6c9a34f62f..6b7abb334ca6 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -50,7 +50,7 @@ static void test_mutex_work(struct work_struct *work)
if (mtx->flags & TEST_MTX_TRY) {
while (!ww_mutex_trylock(&mtx->mutex))
- cpu_relax();
+ cond_resched();
} else {
ww_mutex_lock(&mtx->mutex, NULL);
}
@@ -88,7 +88,7 @@ static int __test_mutex(unsigned int flags)
ret = -EINVAL;
break;
}
- cpu_relax();
+ cond_resched();
} while (time_before(jiffies, timeout));
} else {
ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
@@ -627,7 +627,7 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4096, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
+ ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9ecedc28b928..07e85e5229da 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -246,9 +246,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
+
mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
mem_hotplug_done();
+
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
diff --git a/kernel/module.c b/kernel/module.c
index 3d8f126208e3..7eba6dea4f41 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -17,6 +17,7 @@
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
+#include <linux/extable.h>
#include <linux/moduleloader.h>
#include <linux/trace_events.h>
#include <linux/init.h>
@@ -61,6 +62,7 @@
#include <linux/pfn.h>
#include <linux/bsearch.h>
#include <linux/dynamic_debug.h>
+#include <linux/audit.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -74,9 +76,9 @@
/*
* Modules' sections will be aligned on page boundaries
* to ensure complete separation of code and data, but
- * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ * only when CONFIG_STRICT_MODULE_RWX=y
*/
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
# define debug_align(X) ALIGN(X, PAGE_SIZE)
#else
# define debug_align(X) (X)
@@ -1844,7 +1846,7 @@ static void mod_sysfs_teardown(struct module *mod)
mod_sysfs_fini(mod);
}
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+#ifdef CONFIG_STRICT_MODULE_RWX
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
@@ -2809,6 +2811,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
if (get_modinfo(info, "livepatch")) {
mod->klp = true;
add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
}
return 0;
@@ -3608,6 +3612,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_copy;
}
+ audit_log_kern_module(mod->name);
+
/* Reserve our place in the list. */
err = add_unformed_module(mod);
if (err)
@@ -3696,7 +3702,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->name, after_dashes);
}
- /* Link in to syfs. */
+ /* Link in to sysfs. */
err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
goto coming_cleanup;
@@ -3719,6 +3725,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod_sysfs_teardown(mod);
coming_cleanup:
mod->state = MODULE_STATE_GOING;
+ destroy_params(mod->kp, mod->num_kp);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
klp_module_going(mod);
@@ -4165,22 +4172,23 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
struct module *mod;
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list) {
- if (mod->state == MODULE_STATE_UNFORMED)
- continue;
- if (mod->num_exentries == 0)
- continue;
+ mod = __module_address(addr);
+ if (!mod)
+ goto out;
- e = search_extable(mod->extable,
- mod->extable + mod->num_exentries - 1,
- addr);
- if (e)
- break;
- }
+ if (!mod->num_exentries)
+ goto out;
+
+ e = search_extable(mod->extable,
+ mod->extable + mod->num_exentries - 1,
+ addr);
+out:
preempt_enable();
- /* Now, if we found one, we are running inside it now, hence
- we cannot unload the module, hence no refcnt needed. */
+ /*
+ * Now, if we found one, we are running inside it now, hence
+ * we cannot unload the module, hence no refcnt needed.
+ */
return e;
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index fd2c9acbcc19..6196af8a8223 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -95,7 +95,7 @@ static int notifier_call_chain(struct notifier_block **nl,
if (nr_calls)
(*nr_calls)++;
- if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+ if (ret & NOTIFY_STOP_MASK)
break;
nb = next_nb;
nr_to_call--;
diff --git a/kernel/padata.c b/kernel/padata.c
index 05316c9f32da..3202aa17492c 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -186,19 +186,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
reorder = &next_queue->reorder;
+ spin_lock(&reorder->lock);
if (!list_empty(&reorder->list)) {
padata = list_entry(reorder->list.next,
struct padata_priv, list);
- spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
- spin_unlock(&reorder->lock);
pd->processed++;
+ spin_unlock(&reorder->lock);
goto out;
}
+ spin_unlock(&reorder->lock);
if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
diff --git a/kernel/panic.c b/kernel/panic.c
index 08aa88dde7de..a58932b41700 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -9,6 +9,7 @@
* to indicate a major problem.
*/
#include <linux/debug_locks.h>
+#include <linux/sched/debug.h>
#include <linux/interrupt.h>
#include <linux/kmsg_dump.h>
#include <linux/kallsyms.h>
@@ -188,7 +189,7 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_nmi_flush_on_panic();
+ printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -213,7 +214,7 @@ void panic(const char *fmt, ...)
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
/* Call flush even twice. It tries harder with a single online CPU */
- printk_nmi_flush_on_panic();
+ printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -273,7 +274,8 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled;
/* Make sure the user can actually press Stop-A (L1-A) */
stop_a_enabled = 1;
- pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n");
+ pr_emerg("Press Stop-A (L1-A) from sun keyboard or send break\n"
+ "twice on console to return to the boot prom\n");
}
#endif
#if defined(CONFIG_S390)
diff --git a/kernel/pid.c b/kernel/pid.c
index 0291804151b5..0143ac0ddceb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -38,6 +38,7 @@
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
+#include <linux/sched/task.h>
#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index eef2ce968636..de461aa0bf9a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,12 +12,15 @@
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/syscalls.h>
+#include <linux/cred.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
struct pid_cache {
int nr_ids;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26dbc48c75b..a8b978c35a6a 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -10,6 +10,8 @@
* This file is released under the GPLv2.
*/
+#define pr_fmt(fmt) "PM: " fmt
+
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscalls.h>
@@ -21,6 +23,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/pm.h>
+#include <linux/nmi.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
@@ -104,7 +107,7 @@ EXPORT_SYMBOL(system_entering_hibernation);
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
- printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
+ pr_info("hibernation debug: Waiting for 5 seconds.\n");
mdelay(5000);
}
@@ -250,10 +253,9 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
- msg, k,
- centisecs / 100, centisecs % 100,
- kps / 1000, (kps % 1000) / 10);
+ pr_info("%s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
+ msg, k, centisecs / 100, centisecs % 100, kps / 1000,
+ (kps % 1000) / 10);
}
/**
@@ -271,8 +273,7 @@ static int create_image(int platform_mode)
error = dpm_suspend_end(PMSG_FREEZE);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down, "
- "aborting hibernation\n");
+ pr_err("Some devices failed to power down, aborting hibernation\n");
return error;
}
@@ -288,8 +289,7 @@ static int create_image(int platform_mode)
error = syscore_suspend();
if (error) {
- printk(KERN_ERR "PM: Some system devices failed to power down, "
- "aborting hibernation\n");
+ pr_err("Some system devices failed to power down, aborting hibernation\n");
goto Enable_irqs;
}
@@ -304,8 +304,8 @@ static int create_image(int platform_mode)
restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
- printk(KERN_ERR "PM: Error %d creating hibernation image\n",
- error);
+ pr_err("Error %d creating hibernation image\n", error);
+
if (!in_suspend) {
events_check_enabled = false;
clear_free_pages();
@@ -432,8 +432,7 @@ static int resume_target_kernel(bool platform_mode)
error = dpm_suspend_end(PMSG_QUIESCE);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down, "
- "aborting resume\n");
+ pr_err("Some devices failed to power down, aborting resume\n");
return error;
}
@@ -608,6 +607,22 @@ static void power_down(void)
{
#ifdef CONFIG_SUSPEND
int error;
+
+ if (hibernation_mode == HIBERNATION_SUSPEND) {
+ error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ if (error) {
+ hibernation_mode = hibernation_ops ?
+ HIBERNATION_PLATFORM :
+ HIBERNATION_SHUTDOWN;
+ } else {
+ /* Restore swap signature. */
+ error = swsusp_unmark();
+ if (error)
+ pr_err("Swap will be unusable! Try swapon -a.\n");
+
+ return;
+ }
+ }
#endif
switch (hibernation_mode) {
@@ -620,32 +635,13 @@ static void power_down(void)
if (pm_power_off)
kernel_power_off();
break;
-#ifdef CONFIG_SUSPEND
- case HIBERNATION_SUSPEND:
- error = suspend_devices_and_enter(PM_SUSPEND_MEM);
- if (error) {
- if (hibernation_ops)
- hibernation_mode = HIBERNATION_PLATFORM;
- else
- hibernation_mode = HIBERNATION_SHUTDOWN;
- power_down();
- }
- /*
- * Restore swap signature.
- */
- error = swsusp_unmark();
- if (error)
- printk(KERN_ERR "PM: Swap will be unusable! "
- "Try swapon -a.\n");
- return;
-#endif
}
kernel_halt();
/*
* Valid image is on the disk, if we continue we risk serious data
* corruption after resume.
*/
- printk(KERN_CRIT "PM: Please power down manually\n");
+ pr_crit("Power down manually\n");
while (1)
cpu_relax();
}
@@ -655,7 +651,7 @@ static int load_image_and_restore(void)
int error;
unsigned int flags;
- pr_debug("PM: Loading hibernation image.\n");
+ pr_debug("Loading hibernation image.\n");
lock_device_hotplug();
error = create_basic_memory_bitmaps();
@@ -667,7 +663,7 @@ static int load_image_and_restore(void)
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
- printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
+ pr_err("Failed to load hibernation image, recovering.\n");
swsusp_free();
free_basic_memory_bitmaps();
Unlock:
@@ -685,7 +681,7 @@ int hibernate(void)
bool snapshot_test = false;
if (!hibernation_available()) {
- pr_debug("PM: Hibernation not available.\n");
+ pr_debug("Hibernation not available.\n");
return -EPERM;
}
@@ -703,9 +699,9 @@ int hibernate(void)
goto Exit;
}
- printk(KERN_INFO "PM: Syncing filesystems ... ");
+ pr_info("Syncing filesystems ... \n");
sys_sync();
- printk("done.\n");
+ pr_info("done.\n");
error = freeze_processes();
if (error)
@@ -731,7 +727,7 @@ int hibernate(void)
else
flags |= SF_CRC32_MODE;
- pr_debug("PM: writing image.\n");
+ pr_debug("Writing image.\n");
error = swsusp_write(flags);
swsusp_free();
if (!error) {
@@ -743,7 +739,7 @@ int hibernate(void)
in_suspend = 0;
pm_restore_gfp_mask();
} else {
- pr_debug("PM: Image restored successfully.\n");
+ pr_debug("Image restored successfully.\n");
}
Free_bitmaps:
@@ -751,7 +747,7 @@ int hibernate(void)
Thaw:
unlock_device_hotplug();
if (snapshot_test) {
- pr_debug("PM: Checking hibernation image\n");
+ pr_debug("Checking hibernation image\n");
error = swsusp_check();
if (!error)
error = load_image_and_restore();
@@ -815,10 +811,10 @@ static int software_resume(void)
goto Unlock;
}
- pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
+ pr_debug("Checking hibernation image partition %s\n", resume_file);
if (resume_delay) {
- printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
+ pr_info("Waiting %dsec before reading resume device ...\n",
resume_delay);
ssleep(resume_delay);
}
@@ -857,10 +853,10 @@ static int software_resume(void)
}
Check_image:
- pr_debug("PM: Hibernation image partition %d:%d present\n",
+ pr_debug("Hibernation image partition %d:%d present\n",
MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- pr_debug("PM: Looking for hibernation image.\n");
+ pr_debug("Looking for hibernation image.\n");
error = swsusp_check();
if (error)
goto Unlock;
@@ -879,7 +875,7 @@ static int software_resume(void)
goto Close_Finish;
}
- pr_debug("PM: Preparing processes for restore.\n");
+ pr_debug("Preparing processes for restore.\n");
error = freeze_processes();
if (error)
goto Close_Finish;
@@ -892,7 +888,7 @@ static int software_resume(void)
/* For success case, the suspend path will release the lock */
Unlock:
mutex_unlock(&pm_mutex);
- pr_debug("PM: Hibernation image not present or could not be loaded.\n");
+ pr_debug("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
swsusp_close(FMODE_READ);
@@ -1016,7 +1012,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
error = -EINVAL;
if (!error)
- pr_debug("PM: Hibernation mode set to '%s'\n",
+ pr_debug("Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
unlock_system_sleep();
return error ? error : n;
@@ -1052,7 +1048,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
lock_system_sleep();
swsusp_resume_device = res;
unlock_system_sleep();
- printk(KERN_INFO "PM: Starting manual resume from disk\n");
+ pr_info("Starting manual resume from disk\n");
noresume = 0;
software_resume();
return n;
@@ -1156,7 +1152,7 @@ static int __init hibernate_setup(char *str)
} else if (!strncmp(str, "no", 2)) {
noresume = 1;
nohibernate = 1;
- } else if (IS_ENABLED(CONFIG_DEBUG_RODATA)
+ } else if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)
&& !strncmp(str, "protect_image", 13)) {
enable_restore_image_protection();
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 1dfa0da827d3..7fdc40d31b7d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -61,12 +61,12 @@ extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
/* kernel/power/snapshot.c */
extern void enable_restore_image_protection(void);
#else
static inline void enable_restore_image_protection(void) {}
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
#else /* !CONFIG_HIBERNATION */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 2fba066e125f..c7209f060eeb 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -12,6 +12,8 @@
#include <linux/oom.h>
#include <linux/suspend.h>
#include <linux/module.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
#include <linux/delay.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2d8e2b227db8..d79a38de425a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -22,6 +22,7 @@
#include <linux/device.h>
#include <linux/init.h>
#include <linux/bootmem.h>
+#include <linux/nmi.h>
#include <linux/syscalls.h>
#include <linux/console.h>
#include <linux/highmem.h>
@@ -38,7 +39,7 @@
#include "power.h"
-#ifdef CONFIG_DEBUG_RODATA
+#ifdef CONFIG_STRICT_KERNEL_RWX
static bool hibernate_restore_protection;
static bool hibernate_restore_protection_active;
@@ -73,7 +74,7 @@ static inline void hibernate_restore_protection_begin(void) {}
static inline void hibernate_restore_protection_end(void) {}
static inline void hibernate_restore_protect_page(void *page_address) {}
static inline void hibernate_restore_unprotect_page(void *page_address) {}
-#endif /* CONFIG_DEBUG_RODATA */
+#endif /* CONFIG_STRICT_KERNEL_RWX */
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index bdff5ed57f10..5db217051232 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -166,7 +166,7 @@ static int __init setup_test_suspend(char *value)
return 0;
}
- for (i = 0; pm_labels[i]; i++)
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 32e0c232efba..f80fd33639e0 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -201,7 +201,7 @@ void free_all_swap_pages(int swap)
struct swsusp_extent *ext;
unsigned long offset;
- ext = container_of(node, struct swsusp_extent, node);
+ ext = rb_entry(node, struct swsusp_extent, node);
rb_erase(node, &swsusp_extents);
for (offset = ext->start; offset <= ext->end; offset++)
swap_free(swp_entry(swap, offset));
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index abb0042a427b..4a2ffc39eb95 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,3 @@
obj-y = printk.o
-obj-$(CONFIG_PRINTK_NMI) += nmi.o
+obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 7fd2838fa417..1db044f808b7 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,42 +16,55 @@
*/
#include <linux/percpu.h>
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK
-int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
-
-#ifdef CONFIG_PRINTK_NMI
+#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
+#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;
+__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
+void __printk_safe_enter(void);
+void __printk_safe_exit(void);
+
+#define printk_safe_enter_irqsave(flags) \
+ do { \
+ local_irq_save(flags); \
+ __printk_safe_enter(); \
+ } while (0)
+
+#define printk_safe_exit_irqrestore(flags) \
+ do { \
+ __printk_safe_exit(); \
+ local_irq_restore(flags); \
+ } while (0)
+
+#define printk_safe_enter_irq() \
+ do { \
+ local_irq_disable(); \
+ __printk_safe_enter(); \
+ } while (0)
+
+#define printk_safe_exit_irq() \
+ do { \
+ __printk_safe_exit(); \
+ local_irq_enable(); \
+ } while (0)
+
+#else
+
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
+
/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it temporary stores the strings into a per-CPU buffer.
- * The alternative implementation is chosen transparently
- * via per-CPU variable.
+ * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
+ * semaphore and some of console functions (console_unlock()/etc.), so
+ * printk-safe must preserve the existing local IRQ guarantees.
*/
-DECLARE_PER_CPU(printk_func_t, printk_func);
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
- return this_cpu_read(printk_func)(fmt, args);
-}
-
-extern atomic_t nmi_message_lost;
-static inline int get_nmi_message_lost(void)
-{
- return atomic_xchg(&nmi_message_lost, 0);
-}
-
-#else /* CONFIG_PRINTK_NMI */
-
-static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
- return vprintk_default(fmt, args);
-}
-
-static inline int get_nmi_message_lost(void)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
+#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
+#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
+
+#define printk_safe_enter_irq() local_irq_disable()
+#define printk_safe_exit_irq() local_irq_enable()
+
+#endif /* CONFIG_PRINTK */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 4ba3d34938c0..2984fb0f0257 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,9 @@
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -213,17 +216,36 @@ static int nr_ext_console_drivers;
static int __down_trylock_console_sem(unsigned long ip)
{
- if (down_trylock(&console_sem))
+ int lock_failed;
+ unsigned long flags;
+
+ /*
+ * Here and in __up_console_sem() we need to be in safe mode,
+ * because spindump/WARN/etc from under console ->lock will
+ * deadlock in printk()->down_trylock_console_sem() otherwise.
+ */
+ printk_safe_enter_irqsave(flags);
+ lock_failed = down_trylock(&console_sem);
+ printk_safe_exit_irqrestore(flags);
+
+ if (lock_failed)
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
}
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
-#define up_console_sem() do { \
- mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
- up(&console_sem);\
-} while (0)
+static void __up_console_sem(unsigned long ip)
+{
+ unsigned long flags;
+
+ mutex_release(&console_lock_dep_map, 1, ip);
+
+ printk_safe_enter_irqsave(flags);
+ up(&console_sem);
+ printk_safe_exit_irqrestore(flags);
+}
+#define up_console_sem() __up_console_sem(_RET_IP_)
/*
* This is used for debugging the mess that is the VT code by
@@ -351,6 +373,34 @@ __packed __aligned(4)
*/
DEFINE_RAW_SPINLOCK(logbuf_lock);
+/*
+ * Helper macros to lock/unlock logbuf_lock and switch between
+ * printk-safe/unsafe modes.
+ */
+#define logbuf_lock_irq() \
+ do { \
+ printk_safe_enter_irq(); \
+ raw_spin_lock(&logbuf_lock); \
+ } while (0)
+
+#define logbuf_unlock_irq() \
+ do { \
+ raw_spin_unlock(&logbuf_lock); \
+ printk_safe_exit_irq(); \
+ } while (0)
+
+#define logbuf_lock_irqsave(flags) \
+ do { \
+ printk_safe_enter_irqsave(flags); \
+ raw_spin_lock(&logbuf_lock); \
+ } while (0)
+
+#define logbuf_unlock_irqrestore(flags) \
+ do { \
+ raw_spin_unlock(&logbuf_lock); \
+ printk_safe_exit_irqrestore(flags); \
+ } while (0)
+
#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -782,20 +832,21 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
- raw_spin_lock_irq(&logbuf_lock);
+
+ logbuf_lock_irq();
while (user->seq == log_next_seq) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
goto out;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
ret = wait_event_interruptible(log_wait,
user->seq != log_next_seq);
if (ret)
goto out;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
}
if (user->seq < log_first_seq) {
@@ -803,7 +854,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->idx = log_first_idx;
user->seq = log_first_seq;
ret = -EPIPE;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
goto out;
}
@@ -816,7 +867,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
user->idx = log_next(user->idx);
user->seq++;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
if (len > count) {
ret = -EINVAL;
@@ -843,7 +894,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
if (offset)
return -ESPIPE;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
switch (whence) {
case SEEK_SET:
/* the first record */
@@ -867,7 +918,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
default:
ret = -EINVAL;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
return ret;
}
@@ -881,7 +932,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (user->seq < log_next_seq) {
/* return error when data has vanished underneath us */
if (user->seq < log_first_seq)
@@ -889,7 +940,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
else
ret = POLLIN|POLLRDNORM;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
return ret;
}
@@ -919,10 +970,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
user->idx = log_first_idx;
user->seq = log_first_seq;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
file->private_data = user;
return 0;
@@ -1064,13 +1115,13 @@ void __init setup_log_buf(int early)
return;
}
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
free = __LOG_BUF_LEN - log_next_idx;
memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
pr_info("log_buf_len: %d bytes\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
@@ -1248,7 +1299,7 @@ static int syslog_print(char __user *buf, int size)
size_t n;
size_t skip;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
@@ -1256,7 +1307,7 @@ static int syslog_print(char __user *buf, int size)
syslog_partial = 0;
}
if (syslog_seq == log_next_seq) {
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
break;
}
@@ -1275,7 +1326,7 @@ static int syslog_print(char __user *buf, int size)
syslog_partial += n;
} else
n = 0;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
if (!n)
break;
@@ -1304,7 +1355,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
if (!text)
return -ENOMEM;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (buf) {
u64 next_seq;
u64 seq;
@@ -1352,12 +1403,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
idx = log_next(idx);
seq++;
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (seq < log_first_seq) {
/* messages are gone, move to next one */
@@ -1371,7 +1422,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
clear_seq = log_next_seq;
clear_idx = log_next_idx;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
kfree(text);
return len;
@@ -1458,7 +1509,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- raw_spin_lock_irq(&logbuf_lock);
+ logbuf_lock_irq();
if (syslog_seq < log_first_seq) {
/* messages are gone, move to first one */
syslog_seq = log_first_seq;
@@ -1486,7 +1537,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
}
error -= syslog_partial;
}
- raw_spin_unlock_irq(&logbuf_lock);
+ logbuf_unlock_irq();
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
@@ -1510,8 +1561,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(int level,
- const char *ext_text, size_t ext_len,
+static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len)
{
struct console *con;
@@ -1538,28 +1588,6 @@ static void call_console_drivers(int level,
}
}
-/*
- * Zap console related locks when oopsing.
- * To leave time for slow consoles to print a full oops,
- * only zap at most once every 30 seconds.
- */
-static void zap_locks(void)
-{
- static unsigned long oops_timestamp;
-
- if (time_after_eq(jiffies, oops_timestamp) &&
- !time_after(jiffies, oops_timestamp + 30 * HZ))
- return;
-
- oops_timestamp = jiffies;
-
- debug_locks_off();
- /* If a crash is occurring, make sure we can't deadlock */
- raw_spin_lock_init(&logbuf_lock);
- /* And make sure that we print immediately */
- sema_init(&console_sem, 1);
-}
-
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1669,18 +1697,13 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static bool recursion_bug;
static char textbuf[LOG_LINE_MAX];
char *text = textbuf;
size_t text_len = 0;
enum log_flags lflags = 0;
unsigned long flags;
- int this_cpu;
int printed_len = 0;
- int nmi_message_lost;
bool in_sched = false;
- /* cpu currently holding logbuf_lock in this function */
- static unsigned int logbuf_cpu = UINT_MAX;
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
@@ -1690,53 +1713,8 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- local_irq_save(flags);
- this_cpu = smp_processor_id();
-
- /*
- * Ouch, printk recursed into itself!
- */
- if (unlikely(logbuf_cpu == this_cpu)) {
- /*
- * If a crash is occurring during printk() on this CPU,
- * then try to get the crash message out but make sure
- * we can't deadlock. Otherwise just return to avoid the
- * recursion and return - but flag the recursion so that
- * it can be printed at the next appropriate moment:
- */
- if (!oops_in_progress && !lockdep_recursing(current)) {
- recursion_bug = true;
- local_irq_restore(flags);
- return 0;
- }
- zap_locks();
- }
-
- lockdep_off();
/* This stops the holder of console_sem just where we want him */
- raw_spin_lock(&logbuf_lock);
- logbuf_cpu = this_cpu;
-
- if (unlikely(recursion_bug)) {
- static const char recursion_msg[] =
- "BUG: recent printk recursion!";
-
- recursion_bug = false;
- /* emit KERN_CRIT message */
- printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, recursion_msg,
- strlen(recursion_msg));
- }
-
- nmi_message_lost = get_nmi_message_lost();
- if (unlikely(nmi_message_lost)) {
- text_len = scnprintf(textbuf, sizeof(textbuf),
- "BAD LUCK: lost %d message(s) from NMI context!",
- nmi_message_lost);
- printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
- NULL, 0, textbuf, text_len);
- }
-
+ logbuf_lock_irqsave(flags);
/*
* The printf needs to come first; we need the syslog
* prefix which might be passed-in as a parameter.
@@ -1779,14 +1757,10 @@ asmlinkage int vprintk_emit(int facility, int level,
printed_len += log_output(facility, level, lflags, dict, dictlen, text, text_len);
- logbuf_cpu = UINT_MAX;
- raw_spin_unlock(&logbuf_lock);
- lockdep_on();
- local_irq_restore(flags);
+ logbuf_unlock_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
- lockdep_off();
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
@@ -1794,7 +1768,6 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (console_trylock())
console_unlock();
- lockdep_on();
}
return printed_len;
@@ -1803,7 +1776,7 @@ EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ return vprintk_func(fmt, args);
}
EXPORT_SYMBOL(vprintk);
@@ -1895,16 +1868,12 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
-static void call_console_drivers(int level,
- const char *ext_text, size_t ext_len,
+static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg,
bool syslog, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
-/* Still needs to be defined for users */
-DEFINE_PER_CPU(printk_func_t, printk_func);
-
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
@@ -2220,9 +2189,9 @@ again:
struct printk_log *msg;
size_t ext_len = 0;
size_t len;
- int level;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ printk_safe_enter_irqsave(flags);
+ raw_spin_lock(&logbuf_lock);
if (seen_seq != log_next_seq) {
wake_klogd = true;
seen_seq = log_next_seq;
@@ -2243,8 +2212,7 @@ skip:
break;
msg = log_from_idx(console_idx);
- level = msg->level;
- if (suppress_message_printing(level)) {
+ if (suppress_message_printing(msg->level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
@@ -2270,9 +2238,9 @@ skip:
raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(level, ext_text, ext_len, text, len);
+ call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
- local_irq_restore(flags);
+ printk_safe_exit_irqrestore(flags);
if (do_cond_resched)
cond_resched();
@@ -2295,7 +2263,8 @@ skip:
*/
raw_spin_lock(&logbuf_lock);
retry = console_seq != log_next_seq;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ raw_spin_unlock(&logbuf_lock);
+ printk_safe_exit_irqrestore(flags);
if (retry && console_trylock())
goto again;
@@ -2558,10 +2527,10 @@ void register_console(struct console *newcon)
* console_unlock(); will print out the buffered messages
* for us.
*/
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
console_seq = syslog_seq;
console_idx = syslog_idx;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -2860,12 +2829,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
/* initialize iterator with data about the stored records */
dumper->active = true;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
dumper->cur_seq = clear_seq;
dumper->cur_idx = clear_idx;
dumper->next_seq = log_next_seq;
dumper->next_idx = log_next_idx;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
@@ -2950,9 +2919,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
unsigned long flags;
bool ret;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
return ret;
}
@@ -2991,7 +2960,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
if (!dumper->active)
goto out;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
if (dumper->cur_seq < log_first_seq) {
/* messages are gone, move to first available one */
dumper->cur_seq = log_first_seq;
@@ -3000,7 +2969,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* last entry */
if (dumper->cur_seq >= dumper->next_seq) {
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
goto out;
}
@@ -3042,7 +3011,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
dumper->next_seq = next_seq;
dumper->next_idx = next_idx;
ret = true;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
out:
if (len)
*len = l;
@@ -3080,9 +3049,9 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
{
unsigned long flags;
- raw_spin_lock_irqsave(&logbuf_lock, flags);
+ logbuf_lock_irqsave(flags);
kmsg_dump_rewind_nolock(dumper);
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+ logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
diff --git a/kernel/printk/nmi.c b/kernel/printk/printk_safe.c
index f011aaef583c..033e50a7d706 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/printk_safe.c
@@ -1,5 +1,5 @@
/*
- * nmi.c - Safe printk in NMI context
+ * printk_safe.c - Safe printk for printk-deadlock-prone contexts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -32,36 +32,58 @@
* is later flushed into the main ring buffer via IRQ work.
*
* The alternative implementation is chosen transparently
- * via @printk_func per-CPU variable.
+ * by examinig current printk() context mask stored in @printk_context
+ * per-CPU variable.
*
* The implementation allows to flush the strings also from another CPU.
* There are situations when we want to make sure that all buffers
* were handled or when IRQs are blocked.
*/
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-static int printk_nmi_irq_ready;
-atomic_t nmi_message_lost;
+static int printk_safe_irq_ready;
-#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - sizeof(struct irq_work))
+#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
+ sizeof(atomic_t) - \
+ sizeof(atomic_t) - \
+ sizeof(struct irq_work))
-struct nmi_seq_buf {
+struct printk_safe_seq_buf {
atomic_t len; /* length of written data */
+ atomic_t message_lost;
struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[NMI_LOG_BUF_LEN];
+ unsigned char buffer[SAFE_LOG_BUF_LEN];
};
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
+static DEFINE_PER_CPU(int, printk_context);
+
+#ifdef CONFIG_PRINTK_NMI
+static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
+#endif
+
+/* Get flushed in a more safe context. */
+static void queue_flush_work(struct printk_safe_seq_buf *s)
+{
+ if (printk_safe_irq_ready) {
+ /* Make sure that IRQ work is really initialized. */
+ smp_rmb();
+ irq_work_queue(&s->work);
+ }
+}
/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
+ * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
+ * have dedicated buffers, because otherwise printk-safe preempted by
+ * NMI-printk would have overwritten the NMI messages.
+ *
+ * The messages are fushed from irq work (or from panic()), possibly,
+ * from other CPU, concurrently with printk_safe_log_store(). Should this
+ * happen, printk_safe_log_store() will notice the buffer->len mismatch
+ * and repeat the write.
*/
-static int vprintk_nmi(const char *fmt, va_list args)
+static int printk_safe_log_store(struct printk_safe_seq_buf *s,
+ const char *fmt, va_list args)
{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- int add = 0;
+ int add;
size_t len;
again:
@@ -69,18 +91,21 @@ again:
/* The trailing '\0' is not counted into len. */
if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&nmi_message_lost);
+ atomic_inc(&s->message_lost);
+ queue_flush_work(s);
return 0;
}
/*
- * Make sure that all old data have been read before the buffer was
- * reseted. This is not needed when we just append data.
+ * Make sure that all old data have been read before the buffer
+ * was reset. This is not needed when we just append data.
*/
if (!len)
smp_rmb();
add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+ if (!add)
+ return 0;
/*
* Do it once again if the buffer has been flushed in the meantime.
@@ -90,32 +115,23 @@ again:
if (atomic_cmpxchg(&s->len, len, len + add) != len)
goto again;
- /* Get flushed in a more safe context. */
- if (add && printk_nmi_irq_ready) {
- /* Make sure that IRQ work is really initialized. */
- smp_rmb();
- irq_work_queue(&s->work);
- }
-
+ queue_flush_work(s);
return add;
}
-static void printk_nmi_flush_line(const char *text, int len)
+static inline void printk_safe_flush_line(const char *text, int len)
{
/*
- * The buffers are flushed in NMI only on panic. The messages must
- * go only into the ring buffer at this stage. Consoles will get
- * explicitly called later when a crashdump is not generated.
+ * Avoid any console drivers calls from here, because we may be
+ * in NMI or printk_safe context (when in panic). The messages
+ * must go only into the ring buffer at this stage. Consoles will
+ * get explicitly called later when a crashdump is not generated.
*/
- if (in_nmi())
- printk_deferred("%.*s", len, text);
- else
- printk("%.*s", len, text);
-
+ printk_deferred("%.*s", len, text);
}
/* printk part of the temporary buffer line by line */
-static int printk_nmi_flush_buffer(const char *start, size_t len)
+static int printk_safe_flush_buffer(const char *start, size_t len)
{
const char *c, *end;
bool header;
@@ -127,7 +143,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
/* Print line by line. */
while (c < end) {
if (*c == '\n') {
- printk_nmi_flush_line(start, c - start + 1);
+ printk_safe_flush_line(start, c - start + 1);
start = ++c;
header = true;
continue;
@@ -140,7 +156,7 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
continue;
}
- printk_nmi_flush_line(start, c - start);
+ printk_safe_flush_line(start, c - start);
start = c++;
header = true;
continue;
@@ -154,22 +170,31 @@ static int printk_nmi_flush_buffer(const char *start, size_t len)
if (start < end && !header) {
static const char newline[] = KERN_CONT "\n";
- printk_nmi_flush_line(start, end - start);
- printk_nmi_flush_line(newline, strlen(newline));
+ printk_safe_flush_line(start, end - start);
+ printk_safe_flush_line(newline, strlen(newline));
}
return len;
}
+static void report_message_lost(struct printk_safe_seq_buf *s)
+{
+ int lost = atomic_xchg(&s->message_lost, 0);
+
+ if (lost)
+ printk_deferred("Lost %d message(s)!\n", lost);
+}
+
/*
- * Flush data from the associated per_CPU buffer. The function
+ * Flush data from the associated per-CPU buffer. The function
* can be called either via IRQ work or independently.
*/
-static void __printk_nmi_flush(struct irq_work *work)
+static void __printk_safe_flush(struct irq_work *work)
{
static raw_spinlock_t read_lock =
__RAW_SPIN_LOCK_INITIALIZER(read_lock);
- struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+ struct printk_safe_seq_buf *s =
+ container_of(work, struct printk_safe_seq_buf, work);
unsigned long flags;
size_t len;
int i;
@@ -194,9 +219,9 @@ more:
* buffer size.
*/
if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_nmi_flush: internal error\n";
+ const char *msg = "printk_safe_flush: internal error\n";
- printk_nmi_flush_line(msg, strlen(msg));
+ printk_safe_flush_line(msg, strlen(msg));
len = 0;
}
@@ -205,7 +230,7 @@ more:
/* Make sure that data has been written up to the @len */
smp_rmb();
- i += printk_nmi_flush_buffer(s->buffer + i, len - i);
+ i += printk_safe_flush_buffer(s->buffer + i, len - i);
/*
* Check that nothing has got added in the meantime and truncate
@@ -217,35 +242,40 @@ more:
goto more;
out:
+ report_message_lost(s);
raw_spin_unlock_irqrestore(&read_lock, flags);
}
/**
- * printk_nmi_flush - flush all per-cpu nmi buffers.
+ * printk_safe_flush - flush all per-cpu nmi buffers.
*
* The buffers are flushed automatically via IRQ work. This function
* is useful only when someone wants to be sure that all buffers have
* been flushed at some point.
*/
-void printk_nmi_flush(void)
+void printk_safe_flush(void)
{
int cpu;
- for_each_possible_cpu(cpu)
- __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+ for_each_possible_cpu(cpu) {
+#ifdef CONFIG_PRINTK_NMI
+ __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
+#endif
+ __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
+ }
}
/**
- * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
* goes down.
*
- * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * Similar to printk_safe_flush() but it can be called even in NMI context when
* the system goes down. It does the best effort to get NMI messages into
* the main ring buffer.
*
* Note that it could try harder when there is only one CPU online.
*/
-void printk_nmi_flush_on_panic(void)
+void printk_safe_flush_on_panic(void)
{
/*
* Make sure that we could access the main ring buffer.
@@ -259,33 +289,97 @@ void printk_nmi_flush_on_panic(void)
raw_spin_lock_init(&logbuf_lock);
}
- printk_nmi_flush();
+ printk_safe_flush();
}
-void __init printk_nmi_init(void)
+#ifdef CONFIG_PRINTK_NMI
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
{
- int cpu;
+ struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- for_each_possible_cpu(cpu) {
- struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+ return printk_safe_log_store(s, fmt, args);
+}
- init_irq_work(&s->work, __printk_nmi_flush);
- }
+void printk_nmi_enter(void)
+{
+ this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+}
- /* Make sure that IRQ works are initialized before enabling. */
- smp_wmb();
- printk_nmi_irq_ready = 1;
+void printk_nmi_exit(void)
+{
+ this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+}
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_nmi_flush();
+#else
+
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+ return 0;
}
-void printk_nmi_enter(void)
+#endif /* CONFIG_PRINTK_NMI */
+
+/*
+ * Lock-less printk(), to avoid deadlocks should the printk() recurse
+ * into itself. It uses a per-CPU buffer to store the message, just like
+ * NMI.
+ */
+static int vprintk_safe(const char *fmt, va_list args)
{
- this_cpu_write(printk_func, vprintk_nmi);
+ struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
+
+ return printk_safe_log_store(s, fmt, args);
}
-void printk_nmi_exit(void)
+/* Can be preempted by NMI. */
+void __printk_safe_enter(void)
+{
+ this_cpu_inc(printk_context);
+}
+
+/* Can be preempted by NMI. */
+void __printk_safe_exit(void)
{
- this_cpu_write(printk_func, vprintk_default);
+ this_cpu_dec(printk_context);
+}
+
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
+ return vprintk_nmi(fmt, args);
+
+ if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
+ return vprintk_safe(fmt, args);
+
+ return vprintk_default(fmt, args);
+}
+
+void __init printk_safe_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct printk_safe_seq_buf *s;
+
+ s = &per_cpu(safe_print_seq, cpu);
+ init_irq_work(&s->work, __printk_safe_flush);
+
+#ifdef CONFIG_PRINTK_NMI
+ s = &per_cpu(nmi_print_seq, cpu);
+ init_irq_work(&s->work, __printk_safe_flush);
+#endif
+ }
+
+ /* Make sure that IRQ works are initialized before enabling. */
+ smp_wmb();
+ printk_safe_irq_ready = 1;
+
+ /* Flush pending messages that did not have scheduled IRQ works. */
+ printk_safe_flush();
}
diff --git a/kernel/profile.c b/kernel/profile.c
index f67ce0aa6bc4..9aa2a4445b0d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -25,6 +25,8 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/sched/stat.h>
+
#include <asm/sections.h>
#include <asm/irq_regs.h>
#include <asm/ptrace.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 49ba7c1ade9d..266ddcc1d8bb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -10,6 +10,9 @@
#include <linux/capability.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -181,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
WARN_ON(!task->ptrace || task->parent != current);
+ /*
+ * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
+ * Recheck state under the lock to close this race.
+ */
spin_lock_irq(&task->sighand->siglock);
- if (__fatal_signal_pending(task))
- wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
+ if (task->state == __TASK_TRACED) {
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ }
spin_unlock_irq(&task->sighand->siglock);
}
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 123ccbd22449..a4a86fb47e4a 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -30,6 +30,7 @@
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <uapi/linux/sched/types.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d81345be730e..cccc417a8135 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -32,7 +32,8 @@
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <uapi/linux/sched/types.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index e773129c8b08..ef3bcfb15b39 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -30,7 +30,7 @@
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index fa6a48d3917b..6ad330dbbae2 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -25,7 +25,7 @@
#include <linux/completion.h>
#include <linux/interrupt.h>
#include <linux/notifier.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mutex.h>
@@ -47,6 +47,18 @@ static void __call_rcu(struct rcu_head *head,
#include "tiny_plugin.h"
+void rcu_barrier_bh(void)
+{
+ wait_rcu_gp(call_rcu_bh);
+}
+EXPORT_SYMBOL(rcu_barrier_bh);
+
+void rcu_barrier_sched(void)
+{
+ wait_rcu_gp(call_rcu_sched);
+}
+EXPORT_SYMBOL(rcu_barrier_sched);
+
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d80e0d2f68c6..50fee7689e71 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -32,9 +32,10 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
-#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
@@ -49,6 +50,7 @@
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index b60f2b6caa14..ec62a05bfdb3 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -24,6 +24,7 @@
#include <linux/cache.h>
#include <linux/spinlock.h>
+#include <linux/rtmutex.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a240f3308be6..0a62a8f1caac 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -27,7 +27,9 @@
#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
+#include <linux/sched/debug.h>
#include <linux/smpboot.h>
+#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 9e03db9ea9c0..55c8530316c7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -36,7 +36,8 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
@@ -49,6 +50,7 @@
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
+#include <linux/rcupdate_wait.h>
#define CREATE_TRACE_POINTS
diff --git a/kernel/relay.c b/kernel/relay.c
index 8f18d314a96a..0e413d9eec8a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
/*
* fault() vm_op implementation for relay file mapping.
*/
-static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int relay_buf_fault(struct vm_fault *vmf)
{
struct page *page;
- struct rchan_buf *buf = vma->vm_private_data;
+ struct rchan_buf *buf = vmf->vma->vm_private_data;
pgoff_t pgoff = vmf->pgoff;
if (!buf)
@@ -847,7 +847,7 @@ void relay_close(struct rchan *chan)
if (chan->last_toobig)
printk(KERN_WARNING "relay: one or more items not logged "
- "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+ "[item size (%zd) > sub-buffer size (%zd)]\n",
chan->last_toobig, chan->subbuf_size);
list_del(&chan->list);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 890c95f2587a..ce40c810cd5c 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,7 @@
#include <linux/kref.h>
#include <linux/rwsem.h>
+#include <linux/sched/autogroup.h>
struct autogroup {
/*
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ad64efe41722..00a45c45beca 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -58,6 +58,8 @@
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
+#include <linux/nmi.h>
+#include <linux/sched/clock.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
@@ -94,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
static int __sched_clock_stable_early = 1;
/*
- * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
*/
-static __read_mostly u64 raw_offset;
-static __read_mostly u64 gtod_offset;
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;
struct sched_clock_data {
u64 tick_raw;
@@ -129,17 +131,24 @@ static void __set_sched_clock_stable(void)
/*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
- raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
+ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
- scd->tick_gtod, gtod_offset,
- scd->tick_raw, raw_offset);
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
static_branch_enable(&__sched_clock_stable);
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
-static void __clear_sched_clock_stable(struct work_struct *work)
+static void __sched_clock_work(struct work_struct *work)
+{
+ static_branch_disable(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __sched_clock_work);
+
+static void __clear_sched_clock_stable(void)
{
struct sched_clock_data *scd = this_scd();
@@ -152,17 +161,17 @@ static void __clear_sched_clock_stable(struct work_struct *work)
*
* Still do what we can.
*/
- gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
+ __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, gtod_offset,
- scd->tick_raw, raw_offset);
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);
- static_branch_disable(&__sched_clock_stable);
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-}
-static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+ if (sched_clock_stable())
+ schedule_work(&sched_clock_work);
+}
void clear_sched_clock_stable(void)
{
@@ -171,7 +180,7 @@ void clear_sched_clock_stable(void)
smp_mb(); /* matches sched_clock_init_late() */
if (sched_clock_running == 2)
- schedule_work(&sched_clock_work);
+ __clear_sched_clock_stable();
}
void sched_clock_init_late(void)
@@ -212,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y)
*/
static u64 sched_clock_local(struct sched_clock_data *scd)
{
- u64 now, clock, old_clock, min_clock, max_clock;
+ u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
again:
@@ -229,9 +238,10 @@ again:
* scd->tick_gtod + TICK_NSEC);
*/
- clock = scd->tick_gtod + gtod_offset + delta;
- min_clock = wrap_max(scd->tick_gtod, old_clock);
- max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
+ gtod = scd->tick_gtod + __gtod_offset;
+ clock = gtod + delta;
+ min_clock = wrap_max(gtod, old_clock);
+ max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
@@ -315,7 +325,7 @@ u64 sched_clock_cpu(int cpu)
u64 clock;
if (sched_clock_stable())
- return sched_clock() + raw_offset;
+ return sched_clock() + __sched_clock_offset;
if (unlikely(!sched_clock_running))
return 0ull;
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index f063a25d4449..53f9558fa925 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,7 +11,8 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/completion.h>
/**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34e2291a9a6c..3b31fc05a0f1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,10 +6,15 @@
* Copyright (C) 1991-2002 Linus Torvalds
*/
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/hotplug.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
+#include <linux/rcupdate_wait.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -23,6 +28,9 @@
#include <asm/switch_to.h>
#include <asm/tlb.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
#include "sched.h"
#include "../workqueue_internal.h"
@@ -978,7 +986,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
return rq;
/* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq;
rq = move_queued_task(rq, p, dest_cpu);
@@ -1087,6 +1095,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
@@ -1255,10 +1264,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+ if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+ if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1299,10 +1308,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+ if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+ if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1486,14 +1495,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ for_each_cpu(dest_cpu, &p->cpus_allowed) {
if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
continue;
if (!cpu_online(dest_cpu))
@@ -1545,10 +1554,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(tsk_cpus_allowed(p));
+ cpu = cpumask_any(&p->cpus_allowed);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1560,7 +1569,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
- if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
!cpu_online(cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
@@ -2844,7 +2853,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
if (!mm) {
next->active_mm = oldmm;
- atomic_inc(&oldmm->mm_count);
+ mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm_irqs_off(oldmm, mm, next);
@@ -3207,6 +3216,15 @@ static inline void preempt_latency_start(int val) { }
static inline void preempt_latency_stop(int val) { }
#endif
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+ return p->preempt_disable_ip;
+#else
+ return 0;
+#endif
+}
+
/*
* Print scheduling while atomic bug:
*/
@@ -3269,10 +3287,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
/*
- * Optimization: we know that if all tasks are in
- * the fair class we can call that function directly:
+ * Optimization: we know that if all tasks are in the fair class we can
+ * call that function directly, but only if the @prev task wasn't of a
+ * higher scheduling class, because otherwise those loose the
+ * opportunity to pull in more work from other CPUs.
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ if (likely((prev->sched_class == &idle_sched_class ||
+ prev->sched_class == &fair_sched_class) &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
@@ -5229,6 +5252,9 @@ void sched_show_task(struct task_struct *p)
int ppid;
unsigned long state = p->state;
+ /* Make sure the string lines up properly with the number of task states: */
+ BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
if (!try_get_task_stack(p))
return;
if (state)
@@ -5457,7 +5483,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5557,7 +5583,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf, old_rf;
+ struct rq_flags rf;
int dest_cpu;
/*
@@ -5576,7 +5602,9 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
+ rq_pin_lock(rq, &rf);
update_rq_clock(rq);
+ rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5589,7 +5617,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_pin_lock(rq, &rf);
+ rq_repin_lock(rq, &rf);
next = pick_next_task(rq, &fake_task, &rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5618,13 +5646,6 @@ static void migrate_tasks(struct rq *dead_rq)
continue;
}
- /*
- * __migrate_task() may return with a different
- * rq->lock held and a new cookie in 'rf', but we need
- * to preserve rf::clock_update_flags for 'dead_rq'.
- */
- old_rf = rf;
-
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
@@ -5633,7 +5654,6 @@ static void migrate_tasks(struct rq *dead_rq)
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
- rf = old_rf;
}
raw_spin_unlock(&next->pi_lock);
}
@@ -6095,7 +6115,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- atomic_inc(&init_mm.mm_count);
+ mmgrab(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
@@ -6816,11 +6836,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -7226,6 +7255,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index e73119013c53..fba235c7d026 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -128,10 +128,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
+ cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
best_cpu = cpumask_any(later_mask);
goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
best_cpu = cpudl_maximum(cp);
if (later_mask)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index fd4659313640..54c577578da6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -13,6 +13,7 @@
#include <linux/cpufreq.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
#include <trace/events/power.h>
@@ -35,6 +36,7 @@ struct sugov_policy {
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
+ unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
@@ -51,7 +53,6 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
- unsigned int cached_raw_freq;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -115,7 +116,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @sg_cpu: schedutil cpu object to compute the new frequency for.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -135,19 +136,18 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
- unsigned long max)
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
{
- struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
return sg_policy->next_freq;
- sg_cpu->cached_raw_freq = freq;
+ sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -212,7 +212,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
} else {
sugov_get_util(&util, &max);
sugov_iowait_boost(sg_cpu, &util, &max);
- next_f = get_next_freq(sg_cpu, util, max);
+ next_f = get_next_freq(sg_policy, util, max);
}
sugov_update_commit(sg_policy, time, next_f);
}
@@ -266,7 +266,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
sugov_iowait_boost(j_sg_cpu, &util, &max);
}
- return get_next_freq(sg_cpu, util, max);
+ return get_next_freq(sg_policy, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -579,25 +579,19 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
- if (policy_is_shared(policy)) {
- sg_cpu->util = 0;
- sg_cpu->max = 0;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
- sg_cpu->last_update = 0;
- sg_cpu->cached_raw_freq = 0;
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_shared);
- } else {
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_single);
- }
+ sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ policy_is_shared(policy) ?
+ sugov_update_shared :
+ sugov_update_single);
}
return 0;
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 11e9705bf937..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
+ cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 2ecec3a4f1ee..f3778e2b46c8 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,12 +4,8 @@
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
-#include <linux/cputime.h>
+#include <linux/sched/cputime.h>
#include "sched.h"
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27737f34757d..a2ce59015642 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory++;
update_dl_migration(dl_rq);
@@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory--;
update_dl_migration(dl_rq);
@@ -252,7 +252,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online cpu.
*/
- cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
+ cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
if (cpu >= nr_cpu_ids) {
/*
* Fail to find any suitable cpu.
@@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*
* This function returns true if:
*
- * runtime / (deadline - t) > dl_runtime / dl_period ,
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
*
* IOW we can't recycle current parameters.
*
- * Notice that the bandwidth check is done against the period. For
+ * Notice that the bandwidth check is done against the deadline. For
* task with deadline equal to period this is the same of using
- * dl_deadline instead of dl_period in the equation above.
+ * dl_period instead of dl_deadline in the equation above.
*/
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, u64 t)
@@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE);
@@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
}
}
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+ return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
+}
+
/*
* If the entity depleted all its runtime, and if we want it to sleep
* while waiting for some new execution time to become available, we
- * set the bandwidth enforcement timer to the replenishment instant
+ * set the bandwidth replenishment timer to the replenishment instant
* and try to activate it.
*
* Notice that it is important for the caller to know if the timer
@@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
- act = ns_to_ktime(dl_se->deadline);
+ act = ns_to_ktime(dl_next_period(dl_se));
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock);
+ update_rq_clock(rq);
/*
* Now that the task has been migrated to the new RQ and we
@@ -689,6 +695,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
timer->function = dl_task_timer;
}
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadine < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the begin of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ return;
+ dl_se->dl_throttled = 1;
+ }
+}
+
static
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
@@ -922,6 +959,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
+static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline < dl_se->dl_period;
+}
+
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -948,6 +990,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
}
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ dl_check_constrained_dl(&p->dl);
+
+ /*
* If p is throttled, we do nothing. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
@@ -958,7 +1009,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_dl_entity(&p->dl, pi_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1032,9 +1083,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* try to make it stay here, it might be important.
*/
if (unlikely(dl_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &curr->dl)) &&
- (tsk_nr_cpus_allowed(p) > 1)) {
+ (p->nr_cpus_allowed > 1)) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -1055,7 +1106,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
return;
@@ -1063,7 +1114,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1 &&
+ if (p->nr_cpus_allowed != 1 &&
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
@@ -1178,7 +1229,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1235,7 +1286,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1279,7 +1330,7 @@ static int find_later_rq(struct task_struct *task)
if (unlikely(!later_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1;
/*
@@ -1384,8 +1435,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
@@ -1425,7 +1475,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
@@ -1464,7 +1514,7 @@ retry:
*/
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
- tsk_nr_cpus_allowed(rq->curr) > 1) {
+ rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
}
@@ -1611,9 +1661,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
@@ -1727,7 +1777,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
#endif
if (dl_task(rq->curr))
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 109adc0e9cb9..38f019324f1a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -11,7 +11,8 @@
*/
#include <linux/proc_fs.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 274c747a01ce..dea138964b91 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,9 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/topology.h>
+
#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
@@ -1551,7 +1553,7 @@ static void task_numa_compare(struct task_numa_env *env,
*/
if (cur) {
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+ if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
goto unlock;
/*
@@ -1661,7 +1663,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
+ if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
continue;
env->dst_cpu = cpu;
@@ -5458,7 +5460,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
- tsk_cpus_allowed(p)))
+ &p->cpus_allowed))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5578,7 +5580,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return cpumask_first(sched_group_cpus(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5717,7 +5719,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
@@ -5751,7 +5753,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
return cpu;
@@ -5797,13 +5799,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
- if ((avg_idle / 512) < avg_cost)
+ if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
return -1;
time = local_clock();
for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
- if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
continue;
if (idle_cpu(cpu))
break;
@@ -5958,7 +5960,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ && cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6698,7 +6700,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -6718,7 +6720,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7252,7 +7254,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to tsk_cpus_allowed() constraints.
+ * groups is inadequate due to ->cpus_allowed constraints.
*
* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
@@ -8211,8 +8213,7 @@ more_balance:
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
- if (!cpumask_test_cpu(this_cpu,
- tsk_cpus_allowed(busiest->curr))) {
+ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 69631fa46c2f..1b3c8189b286 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -51,6 +51,11 @@ SCHED_FEAT(NONTASK_CAPACITY, true)
*/
SCHED_FEAT(TTWU_QUEUE, true)
+/*
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+SCHED_FEAT(SIS_AVG_CPU, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6a4bae0a649d..ac6d5176463d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -2,6 +2,7 @@
* Generic entry point for the idle threads
*/
#include <linux/sched.h>
+#include <linux/sched/idle.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a2d6eb71f06b..f15fb2bdbc0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -7,6 +7,7 @@
*/
#include <linux/export.h>
+#include <linux/sched/loadavg.h>
#include "sched.h"
@@ -168,7 +169,7 @@ static inline int calc_load_write_idx(void)
* If the folding window started, make sure we start writing in the
* next idle-delta.
*/
- if (!time_before(jiffies, calc_load_update))
+ if (!time_before(jiffies, READ_ONCE(calc_load_update)))
idx++;
return idx & 1;
@@ -201,8 +202,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = READ_ONCE(calc_load_update);
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +213,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
@@ -307,13 +308,15 @@ calc_load_n(unsigned long load, unsigned long exp,
*/
static void calc_global_nohz(void)
{
+ unsigned long sample_window;
long delta, active, n;
- if (!time_before(jiffies, calc_load_update + 10)) {
+ sample_window = READ_ONCE(calc_load_update);
+ if (!time_before(jiffies, sample_window + 10)) {
/*
* Catch-up, fold however many we are behind still
*/
- delta = jiffies - calc_load_update - 10;
+ delta = jiffies - sample_window - 10;
n = 1 + (delta / LOAD_FREQ);
active = atomic_long_read(&calc_load_tasks);
@@ -323,7 +326,7 @@ static void calc_global_nohz(void)
avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
- calc_load_update += n * LOAD_FREQ;
+ WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
}
/*
@@ -351,9 +354,11 @@ static inline void calc_global_nohz(void) { }
*/
void calc_global_load(unsigned long ticks)
{
+ unsigned long sample_window;
long active, delta;
- if (time_before(jiffies, calc_load_update + 10))
+ sample_window = READ_ONCE(calc_load_update);
+ if (time_before(jiffies, sample_window + 10))
return;
/*
@@ -370,7 +375,7 @@ void calc_global_load(unsigned long ticks)
avenrun[1] = calc_load(avenrun[1], EXP_5, active);
avenrun[2] = calc_load(avenrun[2], EXP_15, active);
- calc_load_update += LOAD_FREQ;
+ WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
/*
* In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e8836cfc4cdb..9f3e40226dec 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -335,7 +335,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -352,7 +352,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (tsk_nr_cpus_allowed(p) > 1)
+ if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
- if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (tsk_nr_cpus_allowed(curr) < 2 ||
+ (curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
@@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
+ if (rq->curr->nr_cpus_allowed == 1 ||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
@@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (tsk_nr_cpus_allowed(p) != 1
+ if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
+ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1591,7 +1591,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
}
@@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (tsk_nr_cpus_allowed(task) == 1)
+ if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1726,8 +1726,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu,
- tsk_cpus_allowed(task)) ||
+ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -1762,7 +1761,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
+ BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
@@ -2122,9 +2121,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- tsk_nr_cpus_allowed(p) > 1 &&
+ p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
- (tsk_nr_cpus_allowed(rq->curr) < 2 ||
+ (rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -2197,7 +2196,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 71b10a9b73cf..5cbf92214ad8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,9 +1,26 @@
#include <linux/sched.h>
+#include <linux/sched/autogroup.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/topology.h>
#include <linux/sched/rt.h>
-#include <linux/u64_stats_sync.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/numa_balancing.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/cpufreq.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/init.h>
+
+#include <linux/u64_stats_sync.h>
#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
@@ -13,6 +30,10 @@
#include <linux/tick.h>
#include <linux/slab.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
#include "cpupri.h"
#include "cpudeadline.h"
#include "cpuacct.h"
@@ -1817,7 +1838,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void print_dl_stats(struct seq_file *m, int cpu);
extern void
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
-
#ifdef CONFIG_NUMA_BALANCING
extern void
show_numa_stats(struct task_struct *p, struct seq_file *m);
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index bf0da0aa0a14..d5710651043b 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -164,114 +164,3 @@ sched_info_switch(struct rq *rq,
#define sched_info_arrive(rq, next) do { } while (0)
#define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
-
-/*
- * The following are functions that support scheduler-internal time accounting.
- * These functions are generally called at the timer tick. None of this depends
- * on CONFIG_SCHEDSTATS.
- */
-
-/**
- * get_running_cputimer - return &tsk->signal->cputimer if cputimer is running
- *
- * @tsk: Pointer to target task.
- */
-#ifdef CONFIG_POSIX_TIMERS
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-
- /* Check if cputimer isn't running. This is accessed without locking. */
- if (!READ_ONCE(cputimer->running))
- return NULL;
-
- /*
- * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
- * in __exit_signal(), we won't account to the signal struct further
- * cputime consumed by that task, even though the task can still be
- * ticking after __exit_signal().
- *
- * In order to keep a consistent behaviour between thread group cputime
- * and thread group cputimer accounting, lets also ignore the cputime
- * elapsing after __exit_signal() in any thread group timer running.
- *
- * This makes sure that POSIX CPU clocks and timers are synchronized, so
- * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
- * clock delta is behind the expiring timer value.
- */
- if (unlikely(!tsk->sighand))
- return NULL;
-
- return cputimer;
-}
-#else
-static inline
-struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk)
-{
- return NULL;
-}
-#endif
-
-/**
- * account_group_user_time - Maintain utime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the utime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the utime field there.
- */
-static inline void account_group_user_time(struct task_struct *tsk,
- u64 cputime)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(cputime, &cputimer->cputime_atomic.utime);
-}
-
-/**
- * account_group_system_time - Maintain stime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @cputime: Time value by which to increment the stime field of the
- * thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the stime field there.
- */
-static inline void account_group_system_time(struct task_struct *tsk,
- u64 cputime)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(cputime, &cputimer->cputime_atomic.stime);
-}
-
-/**
- * account_group_exec_runtime - Maintain exec runtime for a thread group.
- *
- * @tsk: Pointer to task structure.
- * @ns: Time value by which to increment the sum_exec_runtime field
- * of the thread_group_cputime structure.
- *
- * If thread group time is being maintained, get the structure for the
- * running CPU and update the sum_exec_runtime field there.
- */
-static inline void account_group_exec_runtime(struct task_struct *tsk,
- unsigned long long ns)
-{
- struct thread_group_cputimer *cputimer = get_running_cputimer(tsk);
-
- if (!cputimer)
- return;
-
- atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
-}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 82f0dff90030..3d5610dcce11 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,4 +1,4 @@
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/swait.h>
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9453efe9b25a..b8c84c6dee64 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -5,7 +5,8 @@
*/
#include <linux/init.h>
#include <linux/export.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
@@ -241,6 +242,45 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_event);
+/*
+ * Note! These two wait functions are entered with the
+ * wait-queue lock held (and interrupts off in the _irq
+ * case), so there is no race with testing the wakeup
+ * condition in the caller before they add the wait
+ * entry to the wake queue.
+ */
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
+{
+ if (likely(list_empty(&wait->task_list)))
+ __add_wait_queue_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock(&wq->lock);
+ schedule();
+ spin_lock(&wq->lock);
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr);
+
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait)
+{
+ if (likely(list_empty(&wait->task_list)))
+ __add_wait_queue_tail(wq, wait);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ spin_unlock_irq(&wq->lock);
+ schedule();
+ spin_lock_irq(&wq->lock);
+ return 0;
+}
+EXPORT_SYMBOL(do_wait_intr_irq);
+
/**
* finish_wait - clean up after waiting in a queue
* @q: waitqueue waited on
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f7ce79a46050..65f61077ad50 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -16,7 +16,9 @@
#include <linux/atomic.h>
#include <linux/audit.h>
#include <linux/compat.h>
+#include <linux/coredump.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
@@ -486,6 +488,17 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
+{
+ memset(info, 0, sizeof(*info));
+ info->si_signo = SIGSYS;
+ info->si_code = SYS_SECCOMP;
+ info->si_call_addr = (void __user *)KSTK_EIP(current);
+ info->si_errno = reason;
+ info->si_arch = syscall_get_arch();
+ info->si_syscall = syscall;
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
@@ -496,13 +509,7 @@ void put_seccomp_filter(struct task_struct *tsk)
static void seccomp_send_sigsys(int syscall, int reason)
{
struct siginfo info;
- memset(&info, 0, sizeof(info));
- info.si_signo = SIGSYS;
- info.si_code = SYS_SECCOMP;
- info.si_call_addr = (void __user *)KSTK_EIP(current);
- info.si_errno = reason;
- info.si_arch = syscall_get_arch();
- info.si_syscall = syscall;
+ seccomp_init_siginfo(&info, syscall, reason);
force_sig_info(SIGSYS, &info, current);
}
#endif /* CONFIG_SECCOMP_FILTER */
@@ -634,10 +641,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_KILL:
- default:
+ default: {
+ siginfo_t info;
audit_seccomp(this_syscall, SIGSYS, action);
+ /* Dump core only if this is the last remaining thread. */
+ if (get_nr_threads(current) == 1) {
+ /* Show the original registers in the dump. */
+ syscall_rollback(current, task_pt_regs(current));
+ /* Trigger a manual coredump since do_exit skips it. */
+ seccomp_init_siginfo(&info, this_syscall, data);
+ do_coredump(&info);
+ }
do_exit(SIGSYS);
}
+ }
unreachable();
diff --git a/kernel/signal.c b/kernel/signal.c
index 13f9def8b24a..7e59ebc2c25e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -13,7 +13,12 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/user.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
+#include <linux/sched/cputime.h>
#include <linux/fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
@@ -2395,11 +2400,11 @@ void exit_signals(struct task_struct *tsk)
* @tsk is about to have PF_EXITING set - lock out users which
* expect stable threadgroup.
*/
- threadgroup_change_begin(tsk);
+ cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
- threadgroup_change_end(tsk);
+ cgroup_threadgroup_change_end(tsk);
return;
}
@@ -2410,7 +2415,7 @@ void exit_signals(struct task_struct *tsk)
*/
tsk->flags |= PF_EXITING;
- threadgroup_change_end(tsk);
+ cgroup_threadgroup_change_end(tsk);
if (!signal_pending(tsk))
goto out;
@@ -3239,10 +3244,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss)
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
+ int err;
struct task_struct *t = current;
- return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
- __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
+ &uss->ss_sp) |
+ __put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
+ if (err)
+ return err;
+ if (t->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(t);
+ return 0;
}
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 77fcdb9f2775..a817769b53c0 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -17,6 +17,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include "smpboot.h"
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 4a5c6e73ecd4..1d71c051a951 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -9,6 +9,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d4a9a6df956..7ff6d1b10cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,13 @@
#include <linux/binfmts.h>
#include <linux/sched.h>
+#include <linux/sched/autogroup.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/coredump.h>
+#include <linux/sched/task.h>
+#include <linux/sched/cputime.h>
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
@@ -2063,6 +2070,24 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+static int propagate_has_child_subreaper(struct task_struct *p, void *data)
+{
+ /*
+ * If task has has_child_subreaper - all its decendants
+ * already have these flag too and new decendants will
+ * inherit it on fork, skip them.
+ *
+ * If we've found child_reaper - skip descendants in
+ * it's subtree as they will never get out pidns.
+ */
+ if (p->signal->has_child_subreaper ||
+ is_child_reaper(task_pid(p)))
+ return 0;
+
+ p->signal->has_child_subreaper = 1;
+ return 1;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2214,6 +2239,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
break;
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
+ if (!arg2)
+ break;
+
+ walk_process_tree(me, propagate_has_child_subreaper, NULL);
break;
case PR_GET_CHILD_SUBREAPER:
error = put_user(me->signal->is_child_subreaper,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bb260ceb3718..8c8714fcb53c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/coredump.h>
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
@@ -2132,9 +2133,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
+ *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e6dc9a538efa..ce3a31e8eb36 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -19,6 +19,8 @@
#include <linux/hrtimer.h>
#include <linux/timerqueue.h>
#include <linux/rtc.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
#include <linux/alarmtimer.h>
#include <linux/mutex.h>
#include <linux/platform_device.h>
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8e11d8d9f419..ec08f527d7ee 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -43,10 +43,12 @@
#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
#include <linux/timer.h>
#include <linux/freezer.h>
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index a95f13c31464..087d6a1279b8 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -10,6 +10,8 @@
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 7906b3f0c41a..497719127bf9 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second)
shift_hz += cycles_per_tick/2;
do_div(shift_hz, cycles_per_tick);
/* Calculate nsec_per_tick using shift_hz */
- nsec_per_tick = (u64)TICK_NSEC << 8;
+ nsec_per_tick = (u64)NSEC_PER_SEC << 8;
nsec_per_tick += (u32)shift_hz/2;
do_div(nsec_per_tick, (u32)shift_hz);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index b4377a5e4269..4513ad16a253 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -2,7 +2,8 @@
* Implement CPU time clocks for the POSIX clock interface.
*/
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/posix-timers.h>
#include <linux/errno.h>
#include <linux/math64.h>
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1e6623d76750..50a6a47020de 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/mutex.h>
+#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/list.h>
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a26036d37a38..ea6b610c4c57 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -13,6 +13,7 @@
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/hrtimer.h>
#include <linux/sched_clock.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2c115fdab397..7fe53be86077 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -17,8 +17,12 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
+#include <linux/nmi.h>
#include <linux/profile.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/stat.h>
+#include <linux/sched/nohz.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 95b258dd75db..5b63a2102c29 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,9 @@
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/nmi.h>
#include <linux/sched.h>
+#include <linux/sched/loadavg.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 82a6bfa0c307..1dc0256bfb6e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -38,8 +38,10 @@
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
diff --git a/kernel/torture.c b/kernel/torture.c
index 0d887eb62856..55de96529287 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -30,6 +30,7 @@
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/completion.h>
@@ -311,7 +312,7 @@ EXPORT_SYMBOL_GPL(torture_random);
/*
* Variables for shuffling. The idea is to ensure that each CPU stays
* idle for an extended period to test interactions with dyntick idle,
- * as well as interactions with any per-CPU varibles.
+ * as well as interactions with any per-CPU variables.
*/
struct shuffle_task {
struct list_head st_l;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d5038005eb5d..d4a06e714645 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -429,7 +429,7 @@ config BLK_DEV_IO_TRACE
If unsure, say N.
-config KPROBE_EVENT
+config KPROBE_EVENTS
depends on KPROBES
depends on HAVE_REGS_AND_STACK_ACCESS_API
bool "Enable kprobes-based dynamic events"
@@ -447,7 +447,7 @@ config KPROBE_EVENT
This option is also required by perf-probe subcommand of perf tools.
If you want to use perf tools, this option is strongly recommended.
-config UPROBE_EVENT
+config UPROBE_EVENTS
bool "Enable uprobes-based dynamic events"
depends on ARCH_SUPPORTS_UPROBES
depends on MMU
@@ -466,7 +466,7 @@ config UPROBE_EVENT
config BPF_EVENTS
depends on BPF_SYSCALL
- depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS
+ depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS
bool
default y
help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index e57980845549..90f2701d92a7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -57,7 +57,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
-obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
+obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_PM),y)
obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
@@ -66,7 +66,7 @@ ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
-obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 95cecbf67f5c..b2058a7f94bd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -28,6 +28,8 @@
#include <linux/uaccess.h>
#include <linux/list.h>
+#include "../../block/blk.h"
+
#include <trace/events/block.h>
#include "trace_output.h"
@@ -292,9 +294,6 @@ record_it:
local_irq_restore(flags);
}
-static struct dentry *blk_tree_root;
-static DEFINE_MUTEX(blk_tree_mutex);
-
static void blk_trace_free(struct blk_trace *bt)
{
debugfs_remove(bt->msg_file);
@@ -433,9 +432,9 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
/*
* Setup everything required to start tracing
*/
-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- struct blk_user_trace_setup *buts)
+static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ struct blk_user_trace_setup *buts)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
@@ -468,22 +467,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = -ENOENT;
- mutex_lock(&blk_tree_mutex);
- if (!blk_tree_root) {
- blk_tree_root = debugfs_create_dir("block", NULL);
- if (!blk_tree_root) {
- mutex_unlock(&blk_tree_mutex);
- goto err;
- }
- }
- mutex_unlock(&blk_tree_mutex);
-
- dir = debugfs_create_dir(buts->name, blk_tree_root);
+ if (!blk_debugfs_root)
+ goto err;
+ dir = debugfs_lookup(buts->name, blk_debugfs_root);
+ if (!dir)
+ bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
if (!dir)
goto err;
- bt->dir = dir;
bt->dev = dev;
atomic_set(&bt->dropped, 0);
INIT_LIST_HEAD(&bt->running_list);
@@ -525,9 +517,12 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
- return 0;
+ ret = 0;
err:
- blk_trace_free(bt);
+ if (dir && !bt->dir)
+ dput(dir);
+ if (ret)
+ blk_trace_free(bt);
return ret;
}
@@ -712,15 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
if (likely(!bt))
return;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
+ if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, nr_bytes, req_op(rq), rq->cmd_flags,
- what, rq->errors, rq->cmd_len, rq->cmd);
- } else {
+ else
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
- }
+
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
+ rq->cmd_flags, what, rq->errors, 0, NULL);
}
static void blk_add_trace_rq_abort(void *ignore,
@@ -972,11 +965,7 @@ void blk_add_driver_data(struct request_queue *q,
if (likely(!bt))
return;
- if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
- else
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 0,
+ __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1752,31 +1741,6 @@ void blk_trace_remove_sysfs(struct device *dev)
#ifdef CONFIG_EVENT_TRACING
-void blk_dump_cmd(char *buf, struct request *rq)
-{
- int i, end;
- int len = rq->cmd_len;
- unsigned char *cmd = rq->cmd;
-
- if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
- buf[0] = '\0';
- return;
- }
-
- for (end = len - 1; end >= 0; end--)
- if (cmd[end])
- break;
- end++;
-
- for (i = 0; i < len; i++) {
- buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
- if (i == end && end != len - 1) {
- sprintf(buf, " ..");
- break;
- }
- }
-}
-
void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
{
int i = 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fa77311dadb2..cee9802cf3e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.func = bpf_probe_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_RAW_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
};
@@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_STACK,
- .arg3_type = ARG_CONST_STACK_SIZE,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
};
static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
@@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.func = bpf_trace_printk,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
- .arg2_type = ARG_CONST_STACK_SIZE,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
};
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
@@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ int ret;
+
+ /*
+ * The strncpy_from_unsafe() call will likely not fill the entire
+ * buffer, but that's okay in this circumstance as we're probing
+ * arbitrary memory anyway similar to bpf_probe_read() and might
+ * as well probe the stack. Thus, memory is explicitly cleared
+ * only in error case, so that improper users ignoring return
+ * code altogether don't copy garbage; otherwise length of string
+ * is returned that can be used for bpf_perf_event_output() et al.
+ */
+ ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
+ if (unlikely(ret < 0))
+ memset(dst, 0, size);
+
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_probe_read_str_proto = {
+ .func = bpf_probe_read_str,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
return &bpf_current_task_under_cgroup_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
+ case BPF_FUNC_probe_read_str:
+ return &bpf_probe_read_str_proto;
default:
return NULL;
}
@@ -459,6 +491,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
return false;
if (off % size != 0)
return false;
+ /*
+ * Assertion for 32 bit to make sure last 8 byte access
+ * (BPF_DW) to the last 4 byte member is disallowed.
+ */
+ if (off + size > sizeof(struct pt_regs))
+ return false;
+
return true;
}
@@ -467,7 +506,7 @@ static const struct bpf_verifier_ops kprobe_prog_ops = {
.is_valid_access = kprobe_prog_is_valid_access,
};
-static struct bpf_prog_type_list kprobe_tl = {
+static struct bpf_prog_type_list kprobe_tl __ro_after_init = {
.ops = &kprobe_prog_ops,
.type = BPF_PROG_TYPE_KPROBE,
};
@@ -492,8 +531,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_STACK,
- .arg5_type = ARG_CONST_STACK_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM,
+ .arg5_type = ARG_CONST_SIZE,
};
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
@@ -540,6 +579,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
return false;
if (off % size != 0)
return false;
+
+ BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
return true;
}
@@ -548,7 +589,7 @@ static const struct bpf_verifier_ops tracepoint_prog_ops = {
.is_valid_access = tp_prog_is_valid_access,
};
-static struct bpf_prog_type_list tracepoint_tl = {
+static struct bpf_prog_type_list tracepoint_tl __ro_after_init = {
.ops = &tracepoint_prog_ops,
.type = BPF_PROG_TYPE_TRACEPOINT,
};
@@ -572,28 +613,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
return true;
}
-static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
- int src_reg, int ctx_off,
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog)
{
struct bpf_insn *insn = insn_buf;
- switch (ctx_off) {
+ switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
- data), dst_reg, src_reg,
+ data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
- *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
offsetof(struct perf_sample_data, period));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
- regs), dst_reg, src_reg,
+ regs), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, regs));
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
+ si->off);
break;
}
@@ -606,7 +648,7 @@ static const struct bpf_verifier_ops perf_event_prog_ops = {
.convert_ctx_access = pe_prog_convert_ctx_access,
};
-static struct bpf_prog_type_list perf_event_tl = {
+static struct bpf_prog_type_list perf_event_tl __ro_after_init = {
.ops = &perf_event_prog_ops,
.type = BPF_PROG_TYPE_PERF_EVENT,
};
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb230f06ba41..dd3e91d68dc7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -15,6 +15,7 @@
#include <linux/stop_machine.h>
#include <linux/clocksource.h>
+#include <linux/sched/task.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/suspend.h>
@@ -1110,13 +1111,6 @@ struct ftrace_func_entry {
unsigned long ip;
};
-struct ftrace_hash {
- unsigned long size_bits;
- struct hlist_head *buckets;
- unsigned long count;
- struct rcu_head rcu;
-};
-
/*
* We make these constant because no one should touch them,
* but they are used as the default "empty hash", to avoid allocating
@@ -1192,26 +1186,24 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
+static __always_inline unsigned long
+ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip)
{
- return !hash || !hash->count;
+ if (hash->size_bits > 0)
+ return hash_long(ip, hash->size_bits);
+
+ return 0;
}
-static struct ftrace_func_entry *
-ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+/* Only use this function if ftrace_hash_empty() has already been tested */
+static __always_inline struct ftrace_func_entry *
+__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
{
unsigned long key;
struct ftrace_func_entry *entry;
struct hlist_head *hhd;
- if (ftrace_hash_empty(hash))
- return NULL;
-
- if (hash->size_bits > 0)
- key = hash_long(ip, hash->size_bits);
- else
- key = 0;
-
+ key = ftrace_hash_key(hash, ip);
hhd = &hash->buckets[key];
hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
@@ -1221,17 +1213,32 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
return NULL;
}
+/**
+ * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash
+ * @hash: The hash to look at
+ * @ip: The instruction pointer to test
+ *
+ * Search a given @hash to see if a given instruction pointer (@ip)
+ * exists in it.
+ *
+ * Returns the entry that holds the @ip if found. NULL otherwise.
+ */
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+{
+ if (ftrace_hash_empty(hash))
+ return NULL;
+
+ return __ftrace_lookup_ip(hash, ip);
+}
+
static void __add_hash_entry(struct ftrace_hash *hash,
struct ftrace_func_entry *entry)
{
struct hlist_head *hhd;
unsigned long key;
- if (hash->size_bits)
- key = hash_long(entry->ip, hash->size_bits);
- else
- key = 0;
-
+ key = ftrace_hash_key(hash, entry->ip);
hhd = &hash->buckets[key];
hlist_add_head(&entry->hlist, hhd);
hash->count++;
@@ -1383,9 +1390,8 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
struct ftrace_hash *new_hash);
-static int
-ftrace_hash_move(struct ftrace_ops *ops, int enable,
- struct ftrace_hash **dst, struct ftrace_hash *src)
+static struct ftrace_hash *
+__ftrace_hash_move(struct ftrace_hash *src)
{
struct ftrace_func_entry *entry;
struct hlist_node *tn;
@@ -1393,21 +1399,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int ret;
int i;
- /* Reject setting notrace hash on IPMODIFY ftrace_ops */
- if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
- return -EINVAL;
-
/*
- * If the new source is empty, just free dst and assign it
- * the empty_hash.
+ * If the new source is empty, just return the empty_hash.
*/
- if (!src->count) {
- new_hash = EMPTY_HASH;
- goto update;
- }
+ if (!src->count)
+ return EMPTY_HASH;
/*
* Make the hash size about 1/2 the # found
@@ -1421,7 +1419,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- return -ENOMEM;
+ return NULL;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1432,7 +1430,24 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
-update:
+ return new_hash;
+}
+
+static int
+ftrace_hash_move(struct ftrace_ops *ops, int enable,
+ struct ftrace_hash **dst, struct ftrace_hash *src)
+{
+ struct ftrace_hash *new_hash;
+ int ret;
+
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
+ new_hash = __ftrace_hash_move(src);
+ if (!new_hash)
+ return -ENOMEM;
+
/* Make sure this can be applied if it is IPMODIFY ftrace_ops */
if (enable) {
/* IPMODIFY should be updated only when filter_hash updating */
@@ -1466,9 +1481,9 @@ static bool hash_contains_ip(unsigned long ip,
* notrace hash is considered not in the notrace hash.
*/
return (ftrace_hash_empty(hash->filter_hash) ||
- ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ __ftrace_lookup_ip(hash->filter_hash, ip)) &&
(ftrace_hash_empty(hash->notrace_hash) ||
- !ftrace_lookup_ip(hash->notrace_hash, ip));
+ !__ftrace_lookup_ip(hash->notrace_hash, ip));
}
/*
@@ -2880,7 +2895,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
return 0;
/* If in notrace hash, we ignore it too */
@@ -3740,23 +3755,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
ftrace_probe_registered = 1;
}
-static void __disable_ftrace_function_probe(void)
+static bool __disable_ftrace_function_probe(void)
{
int i;
if (!ftrace_probe_registered)
- return;
+ return false;
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
if (hhd->first)
- return;
+ return false;
}
/* no more funcs left */
ftrace_shutdown(&trace_probe_ops, 0);
ftrace_probe_registered = 0;
+ return true;
}
@@ -3886,6 +3902,7 @@ static void
__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data, int flags)
{
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
@@ -3897,6 +3914,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int i, ret;
+ bool disabled;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
func_g.search = NULL;
@@ -3915,6 +3933,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+ old_hash_ops.filter_hash = old_hash;
+ /* Probes only have filters */
+ old_hash_ops.notrace_hash = NULL;
+
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
/* Hmm, should report this somehow */
@@ -3952,12 +3974,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
}
}
mutex_lock(&ftrace_lock);
- __disable_ftrace_function_probe();
+ disabled = __disable_ftrace_function_probe();
/*
* Remove after the disable is called. Otherwise, if the last
* probe is removed, a null hash means *all enabled*.
*/
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+ /* still need to update the function call sites */
+ if (ftrace_enabled && !disabled)
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+ &old_hash_ops);
synchronize_sched();
if (!ret)
free_ftrace_hash_rcu(old_hash);
@@ -4382,7 +4409,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
-static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
static unsigned long save_global_trampoline;
static unsigned long save_global_flags;
@@ -4401,26 +4428,38 @@ static int __init set_graph_notrace_function(char *str)
}
__setup("ftrace_graph_notrace=", set_graph_notrace_function);
+static int __init set_graph_max_depth_function(char *str)
+{
+ if (!str)
+ return 0;
+ fgraph_max_depth = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("ftrace_graph_max_depth=", set_graph_max_depth_function);
+
static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
- unsigned long *table = ftrace_graph_funcs;
- int *count = &ftrace_graph_count;
+ struct ftrace_hash *hash;
- if (!enable) {
- table = ftrace_graph_notrace_funcs;
- count = &ftrace_graph_notrace_count;
- }
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ if (WARN_ON(!hash))
+ return;
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_graph_set_hash(hash, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
}
+
+ if (enable)
+ ftrace_graph_hash = hash;
+ else
+ ftrace_graph_notrace_hash = hash;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4540,26 +4579,55 @@ static const struct file_operations ftrace_notrace_fops = {
static DEFINE_MUTEX(graph_lock);
-int ftrace_graph_count;
-int ftrace_graph_notrace_count;
-unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
-unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+struct ftrace_hash *ftrace_graph_hash = EMPTY_HASH;
+struct ftrace_hash *ftrace_graph_notrace_hash = EMPTY_HASH;
+
+enum graph_filter_type {
+ GRAPH_FILTER_NOTRACE = 0,
+ GRAPH_FILTER_FUNCTION,
+};
+
+#define FTRACE_GRAPH_EMPTY ((void *)1)
struct ftrace_graph_data {
- unsigned long *table;
- size_t size;
- int *count;
- const struct seq_operations *seq_ops;
+ struct ftrace_hash *hash;
+ struct ftrace_func_entry *entry;
+ int idx; /* for hash table iteration */
+ enum graph_filter_type type;
+ struct ftrace_hash *new_hash;
+ const struct seq_operations *seq_ops;
+ struct trace_parser parser;
};
static void *
__g_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_graph_data *fgd = m->private;
+ struct ftrace_func_entry *entry = fgd->entry;
+ struct hlist_head *head;
+ int i, idx = fgd->idx;
- if (*pos >= *fgd->count)
+ if (*pos >= fgd->hash->count)
return NULL;
- return &fgd->table[*pos];
+
+ if (entry) {
+ hlist_for_each_entry_continue(entry, hlist) {
+ fgd->entry = entry;
+ return entry;
+ }
+
+ idx++;
+ }
+
+ for (i = idx; i < 1 << fgd->hash->size_bits; i++) {
+ head = &fgd->hash->buckets[i];
+ hlist_for_each_entry(entry, head, hlist) {
+ fgd->entry = entry;
+ fgd->idx = i;
+ return entry;
+ }
+ }
+ return NULL;
}
static void *
@@ -4575,10 +4643,19 @@ static void *g_start(struct seq_file *m, loff_t *pos)
mutex_lock(&graph_lock);
+ if (fgd->type == GRAPH_FILTER_FUNCTION)
+ fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ else
+ fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+
/* Nothing, tell g_show to print all functions are enabled */
- if (!*fgd->count && !*pos)
- return (void *)1;
+ if (ftrace_hash_empty(fgd->hash) && !*pos)
+ return FTRACE_GRAPH_EMPTY;
+ fgd->idx = 0;
+ fgd->entry = NULL;
return __g_next(m, pos);
}
@@ -4589,22 +4666,22 @@ static void g_stop(struct seq_file *m, void *p)
static int g_show(struct seq_file *m, void *v)
{
- unsigned long *ptr = v;
+ struct ftrace_func_entry *entry = v;
- if (!ptr)
+ if (!entry)
return 0;
- if (ptr == (unsigned long *)1) {
+ if (entry == FTRACE_GRAPH_EMPTY) {
struct ftrace_graph_data *fgd = m->private;
- if (fgd->table == ftrace_graph_funcs)
+ if (fgd->type == GRAPH_FILTER_FUNCTION)
seq_puts(m, "#### all functions enabled ####\n");
else
seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
- seq_printf(m, "%ps\n", (void *)*ptr);
+ seq_printf(m, "%ps\n", (void *)entry->ip);
return 0;
}
@@ -4621,24 +4698,51 @@ __ftrace_graph_open(struct inode *inode, struct file *file,
struct ftrace_graph_data *fgd)
{
int ret = 0;
+ struct ftrace_hash *new_hash = NULL;
- mutex_lock(&graph_lock);
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC)) {
- *fgd->count = 0;
- memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));
+ if (file->f_mode & FMODE_WRITE) {
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX))
+ return -ENOMEM;
+
+ if (file->f_flags & O_TRUNC)
+ new_hash = alloc_ftrace_hash(size_bits);
+ else
+ new_hash = alloc_and_copy_ftrace_hash(size_bits,
+ fgd->hash);
+ if (!new_hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
- mutex_unlock(&graph_lock);
if (file->f_mode & FMODE_READ) {
- ret = seq_open(file, fgd->seq_ops);
+ ret = seq_open(file, &ftrace_graph_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = fgd;
+ } else {
+ /* Failed */
+ free_ftrace_hash(new_hash);
+ new_hash = NULL;
}
} else
file->private_data = fgd;
+out:
+ if (ret < 0 && file->f_mode & FMODE_WRITE)
+ trace_parser_put(&fgd->parser);
+
+ fgd->new_hash = new_hash;
+
+ /*
+ * All uses of fgd->hash must be taken with the graph_lock
+ * held. The graph_lock is going to be released, so force
+ * fgd->hash to be reinitialized when it is taken again.
+ */
+ fgd->hash = NULL;
+
return ret;
}
@@ -4646,6 +4750,7 @@ static int
ftrace_graph_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -4654,18 +4759,26 @@ ftrace_graph_open(struct inode *inode, struct file *file)
if (fgd == NULL)
return -ENOMEM;
- fgd->table = ftrace_graph_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_count;
+ mutex_lock(&graph_lock);
+
+ fgd->hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ fgd->type = GRAPH_FILTER_FUNCTION;
fgd->seq_ops = &ftrace_graph_seq_ops;
- return __ftrace_graph_open(inode, file, fgd);
+ ret = __ftrace_graph_open(inode, file, fgd);
+ if (ret < 0)
+ kfree(fgd);
+
+ mutex_unlock(&graph_lock);
+ return ret;
}
static int
ftrace_graph_notrace_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
@@ -4674,45 +4787,97 @@ ftrace_graph_notrace_open(struct inode *inode, struct file *file)
if (fgd == NULL)
return -ENOMEM;
- fgd->table = ftrace_graph_notrace_funcs;
- fgd->size = FTRACE_GRAPH_MAX_FUNCS;
- fgd->count = &ftrace_graph_notrace_count;
+ mutex_lock(&graph_lock);
+
+ fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+ fgd->type = GRAPH_FILTER_NOTRACE;
fgd->seq_ops = &ftrace_graph_seq_ops;
- return __ftrace_graph_open(inode, file, fgd);
+ ret = __ftrace_graph_open(inode, file, fgd);
+ if (ret < 0)
+ kfree(fgd);
+
+ mutex_unlock(&graph_lock);
+ return ret;
}
static int
ftrace_graph_release(struct inode *inode, struct file *file)
{
+ struct ftrace_graph_data *fgd;
+ struct ftrace_hash *old_hash, *new_hash;
+ struct trace_parser *parser;
+ int ret = 0;
+
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
- kfree(m->private);
+ fgd = m->private;
seq_release(inode, file);
} else {
- kfree(file->private_data);
+ fgd = file->private_data;
}
- return 0;
+
+ if (file->f_mode & FMODE_WRITE) {
+
+ parser = &fgd->parser;
+
+ if (trace_parser_loaded((parser))) {
+ parser->buffer[parser->idx] = 0;
+ ret = ftrace_graph_set_hash(fgd->new_hash,
+ parser->buffer);
+ }
+
+ trace_parser_put(parser);
+
+ new_hash = __ftrace_hash_move(fgd->new_hash);
+ if (!new_hash) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&graph_lock);
+
+ if (fgd->type == GRAPH_FILTER_FUNCTION) {
+ old_hash = rcu_dereference_protected(ftrace_graph_hash,
+ lockdep_is_held(&graph_lock));
+ rcu_assign_pointer(ftrace_graph_hash, new_hash);
+ } else {
+ old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash,
+ lockdep_is_held(&graph_lock));
+ rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash);
+ }
+
+ mutex_unlock(&graph_lock);
+
+ /* Wait till all users are no longer using the old hash */
+ synchronize_sched();
+
+ free_ftrace_hash(old_hash);
+ }
+
+ out:
+ kfree(fgd->new_hash);
+ kfree(fgd);
+
+ return ret;
}
static int
-ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
+ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
{
struct ftrace_glob func_g;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ struct ftrace_func_entry *entry;
int fail = 1;
int not;
- bool exists;
- int i;
/* decode regex */
func_g.type = filter_parse_regex(buffer, strlen(buffer),
&func_g.search, &not);
- if (!not && *idx >= size)
- return -EBUSY;
func_g.len = strlen(func_g.search);
@@ -4729,26 +4894,18 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
continue;
if (ftrace_match_record(rec, &func_g, NULL, 0)) {
- /* if it is in the array */
- exists = false;
- for (i = 0; i < *idx; i++) {
- if (array[i] == rec->ip) {
- exists = true;
- break;
- }
- }
+ entry = ftrace_lookup_ip(hash, rec->ip);
if (!not) {
fail = 0;
- if (!exists) {
- array[(*idx)++] = rec->ip;
- if (*idx >= size)
- goto out;
- }
+
+ if (entry)
+ continue;
+ if (add_hash_entry(hash, rec->ip) < 0)
+ goto out;
} else {
- if (exists) {
- array[i] = array[--(*idx)];
- array[*idx] = 0;
+ if (entry) {
+ free_hash_entry(hash, entry);
fail = 0;
}
}
@@ -4767,35 +4924,34 @@ static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_parser parser;
ssize_t read, ret = 0;
struct ftrace_graph_data *fgd = file->private_data;
+ struct trace_parser *parser;
if (!cnt)
return 0;
- if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
- return -ENOMEM;
-
- read = trace_get_user(&parser, ubuf, cnt, ppos);
+ /* Read mode uses seq functions */
+ if (file->f_mode & FMODE_READ) {
+ struct seq_file *m = file->private_data;
+ fgd = m->private;
+ }
- if (read >= 0 && trace_parser_loaded((&parser))) {
- parser.buffer[parser.idx] = 0;
+ parser = &fgd->parser;
- mutex_lock(&graph_lock);
+ read = trace_get_user(parser, ubuf, cnt, ppos);
- /* we allow only one expression at a time */
- ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
- parser.buffer);
+ if (read >= 0 && trace_parser_loaded(parser) &&
+ !trace_parser_cont(parser)) {
- mutex_unlock(&graph_lock);
+ ret = ftrace_graph_set_hash(fgd->new_hash,
+ parser->buffer);
+ trace_parser_clear(parser);
}
if (!ret)
ret = read;
- trace_parser_put(&parser);
-
return ret;
}
@@ -5357,7 +5513,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
* Normally the mcount trampoline will call the ops->func, but there
* are times that it should not. For example, if the ops does not
* have its own recursion protection, then it should call the
- * ftrace_ops_recurs_func() instead.
+ * ftrace_ops_assist_func() instead.
*
* Returns the function that the trampoline should call for @ops.
*/
@@ -5410,6 +5566,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
trace_free_pid_list(pid_list);
}
+void ftrace_clear_pids(struct trace_array *tr)
+{
+ mutex_lock(&ftrace_lock);
+
+ clear_ftrace_pids(tr);
+
+ mutex_unlock(&ftrace_lock);
+}
+
static void ftrace_pid_reset(struct trace_array *tr)
{
mutex_lock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a85739efcc30..54e7a90db848 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -6,6 +6,7 @@
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
+#include <linux/sched/clock.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
@@ -4825,9 +4826,9 @@ static __init int test_ringbuffer(void)
rb_data[cpu].cnt = cpu;
rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
"rbtester/%d", cpu);
- if (WARN_ON(!rb_threads[cpu])) {
+ if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
- ret = -1;
+ ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
@@ -4837,9 +4838,9 @@ static __init int test_ringbuffer(void)
/* Now create the rb hammer! */
rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
- if (WARN_ON(!rb_hammer)) {
+ if (WARN_ON(IS_ERR(rb_hammer))) {
pr_cont("FAILED\n");
- ret = -1;
+ ret = PTR_ERR(rb_hammer);
goto out_free;
}
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 6df9a83e20d7..c190a4d5013c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -6,6 +6,7 @@
#include <linux/ring_buffer.h>
#include <linux/completion.h>
#include <linux/kthread.h>
+#include <uapi/linux/sched/types.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <asm/local.h>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7449783987a..d484452ae648 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -260,16 +260,8 @@ unsigned long long ns2usecs(u64 nsec)
TRACE_ITER_EVENT_FORK
/*
- * The global_trace is the descriptor that holds the tracing
- * buffers for the live tracing. For each CPU, it contains
- * a link list of pages that will store trace entries. The
- * page descriptor of the pages in the memory is used to hold
- * the link list by linking the lru item in the page descriptor
- * to each of the pages in the buffer per CPU.
- *
- * For each active CPU there is a data field that holds the
- * pages for the buffer for that CPU. Each CPU has the same number
- * of pages allocated for its buffer.
+ * The global_trace is the descriptor that holds the top-level tracing
+ * buffers for the live tracing.
*/
static struct trace_array global_trace = {
.trace_flags = TRACE_DEFAULT_FLAGS,
@@ -1193,6 +1185,7 @@ int trace_parser_get_init(struct trace_parser *parser, int size)
void trace_parser_put(struct trace_parser *parser)
{
kfree(parser->buffer);
+ parser->buffer = NULL;
}
/*
@@ -4348,22 +4341,22 @@ static const char readme_msg[] =
"\t\t\t traces\n"
#endif
#endif /* CONFIG_STACK_TRACER */
-#ifdef CONFIG_KPROBE_EVENT
+#ifdef CONFIG_KPROBE_EVENTS
" kprobe_events\t\t- Add/remove/show the kernel dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
-#ifdef CONFIG_UPROBE_EVENT
+#ifdef CONFIG_UPROBE_EVENTS
" uprobe_events\t\t- Add/remove/show the userspace dynamic events\n"
"\t\t\t Write into this file to define/undefine new trace events.\n"
#endif
-#if defined(CONFIG_KPROBE_EVENT) || defined(CONFIG_UPROBE_EVENT)
+#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
"\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
"\t -:[<group>/]<event>\n"
-#ifdef CONFIG_KPROBE_EVENT
+#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
#endif
-#ifdef CONFIG_UPROBE_EVENT
+#ifdef CONFIG_UPROBE_EVENTS
"\t place: <path>:<offset>\n"
#endif
"\t args: <name>=fetcharg[:type]\n"
@@ -7409,6 +7402,7 @@ static int instance_rmdir(const char *name)
tracing_set_nop(tr);
event_trace_del_tracer(tr);
+ ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
@@ -7503,7 +7497,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
-static struct vfsmount *trace_automount(void *ingore)
+static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
@@ -7516,7 +7510,7 @@ static struct vfsmount *trace_automount(void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ mnt = vfs_submount(mntpt, type, "tracefs", NULL);
put_filesystem(type);
if (IS_ERR(mnt))
return NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1ea51ab53edf..d19d52d600d6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -753,6 +753,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern char trace_find_mark(unsigned long long duration);
+struct ftrace_hash {
+ unsigned long size_bits;
+ struct hlist_head *buckets;
+ unsigned long count;
+ struct rcu_head rcu;
+};
+
+struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
+
+static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+ return !hash || !hash->count;
+}
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -787,53 +802,50 @@ extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags, int pc);
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/* TODO: make this variable */
-#define FTRACE_GRAPH_MAX_FUNCS 32
-extern int ftrace_graph_count;
-extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
-extern int ftrace_graph_notrace_count;
-extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];
+extern struct ftrace_hash *ftrace_graph_hash;
+extern struct ftrace_hash *ftrace_graph_notrace_hash;
static inline int ftrace_graph_addr(unsigned long addr)
{
- int i;
-
- if (!ftrace_graph_count)
- return 1;
-
- for (i = 0; i < ftrace_graph_count; i++) {
- if (addr == ftrace_graph_funcs[i]) {
- /*
- * If no irqs are to be traced, but a set_graph_function
- * is set, and called by an interrupt handler, we still
- * want to trace it.
- */
- if (in_irq())
- trace_recursion_set(TRACE_IRQ_BIT);
- else
- trace_recursion_clear(TRACE_IRQ_BIT);
- return 1;
- }
+ int ret = 0;
+
+ preempt_disable_notrace();
+
+ if (ftrace_hash_empty(ftrace_graph_hash)) {
+ ret = 1;
+ goto out;
}
- return 0;
+ if (ftrace_lookup_ip(ftrace_graph_hash, addr)) {
+ /*
+ * If no irqs are to be traced, but a set_graph_function
+ * is set, and called by an interrupt handler, we still
+ * want to trace it.
+ */
+ if (in_irq())
+ trace_recursion_set(TRACE_IRQ_BIT);
+ else
+ trace_recursion_clear(TRACE_IRQ_BIT);
+ ret = 1;
+ }
+
+out:
+ preempt_enable_notrace();
+ return ret;
}
static inline int ftrace_graph_notrace_addr(unsigned long addr)
{
- int i;
+ int ret = 0;
- if (!ftrace_graph_notrace_count)
- return 0;
+ preempt_disable_notrace();
- for (i = 0; i < ftrace_graph_notrace_count; i++) {
- if (addr == ftrace_graph_notrace_funcs[i])
- return 1;
- }
+ if (ftrace_lookup_ip(ftrace_graph_notrace_hash, addr))
+ ret = 1;
- return 0;
+ preempt_enable_notrace();
+ return ret;
}
#else
static inline int ftrace_graph_addr(unsigned long addr)
@@ -884,6 +896,7 @@ int using_ftrace_ops_list_func(void);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
void ftrace_init_tracefs_toplevel(struct trace_array *tr,
struct dentry *d_tracer);
+void ftrace_clear_pids(struct trace_array *tr);
#else
static inline int ftrace_trace_task(struct trace_array *tr)
{
@@ -902,6 +915,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
/* ftace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
@@ -1300,7 +1314,8 @@ static inline bool is_string_field(struct ftrace_event_field *field)
{
return field->filter_type == FILTER_DYN_STRING ||
field->filter_type == FILTER_STATIC_STRING ||
- field->filter_type == FILTER_PTR_STRING;
+ field->filter_type == FILTER_PTR_STRING ||
+ field->filter_type == FILTER_COMM;
}
static inline bool is_function_field(struct ftrace_event_field *field)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e3b488825ae3..e49fbe901cfc 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -175,9 +175,9 @@ int trace_benchmark_reg(void)
bm_event_thread = kthread_run(benchmark_event_kthread,
NULL, "event_benchmark");
- if (!bm_event_thread) {
+ if (IS_ERR(bm_event_thread)) {
pr_warning("trace benchmark failed to create kernel thread\n");
- return -ENOMEM;
+ return PTR_ERR(bm_event_thread);
}
return 0;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 75489de546b6..4d8fdf3184dc 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(branch_tracing_mutex);
static struct trace_array *branch_tracer;
static void
-probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
struct trace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
@@ -68,16 +68,17 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry = ring_buffer_event_data(event);
/* Strip off the path, only save the file */
- p = f->file + strlen(f->file);
- while (p >= f->file && *p != '/')
+ p = f->data.file + strlen(f->data.file);
+ while (p >= f->data.file && *p != '/')
p--;
p++;
- strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+ strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE);
strncpy(entry->file, p, TRACE_FILE_SIZE);
entry->func[TRACE_FUNC_SIZE] = 0;
entry->file[TRACE_FILE_SIZE] = 0;
- entry->line = f->line;
+ entry->constant = f->constant;
+ entry->line = f->data.line;
entry->correct = val == expect;
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -89,7 +90,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
}
static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
if (!branch_tracing_enabled)
return;
@@ -195,13 +196,19 @@ core_initcall(init_branch_tracer);
#else
static inline
-void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
{
}
#endif /* CONFIG_BRANCH_TRACER */
-void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+void ftrace_likely_update(struct ftrace_likely_data *f, int val,
+ int expect, int is_constant)
{
+ /* A constant is always correct */
+ if (is_constant) {
+ f->constant++;
+ val = expect;
+ }
/*
* I would love to have a trace point here instead, but the
* trace point code is so inundated with unlikely and likely
@@ -212,9 +219,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
/* FIXME: Make this atomic! */
if (val == expect)
- f->correct++;
+ f->data.correct++;
else
- f->incorrect++;
+ f->data.incorrect++;
}
EXPORT_SYMBOL(ftrace_likely_update);
@@ -245,29 +252,60 @@ static inline long get_incorrect_percent(struct ftrace_branch_data *p)
return percent;
}
-static int branch_stat_show(struct seq_file *m, void *v)
+static const char *branch_stat_process_file(struct ftrace_branch_data *p)
{
- struct ftrace_branch_data *p = v;
const char *f;
- long percent;
/* Only print the file, not the path */
f = p->file + strlen(p->file);
while (f >= p->file && *f != '/')
f--;
- f++;
+ return ++f;
+}
+
+static void branch_stat_show(struct seq_file *m,
+ struct ftrace_branch_data *p, const char *f)
+{
+ long percent;
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
percent = get_incorrect_percent(p);
- seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
+
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+}
+
+static int branch_stat_show_normal(struct seq_file *m,
+ struct ftrace_branch_data *p, const char *f)
+{
+ seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
+ branch_stat_show(m, p, f);
+ return 0;
+}
+
+static int annotate_branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_likely_data *p = v;
+ const char *f;
+ int l;
+
+ f = branch_stat_process_file(&p->data);
+
+ if (!p->constant)
+ return branch_stat_show_normal(m, &p->data, f);
+
+ l = snprintf(NULL, 0, "/%lu", p->constant);
+ l = l > 8 ? 0 : 8 - l;
+
+ seq_printf(m, "%8lu/%lu %*lu ",
+ p->data.correct, p->constant, l, p->data.incorrect);
+ branch_stat_show(m, &p->data, f);
return 0;
}
@@ -279,7 +317,7 @@ static void *annotated_branch_stat_start(struct tracer_stat *trace)
static void *
annotated_branch_stat_next(void *v, int idx)
{
- struct ftrace_branch_data *p = v;
+ struct ftrace_likely_data *p = v;
++p;
@@ -328,7 +366,7 @@ static struct tracer_stat annotated_branch_stats = {
.stat_next = annotated_branch_stat_next,
.stat_cmp = annotated_branch_stat_cmp,
.stat_headers = annotated_branch_stat_headers,
- .stat_show = branch_stat_show
+ .stat_show = annotate_branch_stat_show
};
__init static int init_annotated_branch_stats(void)
@@ -379,12 +417,21 @@ all_branch_stat_next(void *v, int idx)
return p;
}
+static int all_branch_stat_show(struct seq_file *m, void *v)
+{
+ struct ftrace_branch_data *p = v;
+ const char *f;
+
+ f = branch_stat_process_file(p);
+ return branch_stat_show_normal(m, p, f);
+}
+
static struct tracer_stat all_branch_stats = {
.name = "branch_all",
.stat_start = all_branch_stat_start,
.stat_next = all_branch_stat_next,
.stat_headers = all_branch_stat_headers,
- .stat_show = branch_stat_show
+ .stat_show = all_branch_stat_show
};
__init static int all_annotated_branch_stats(void)
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 0f06532a755b..5fdc779f411d 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/ktime.h>
#include <linux/trace_clock.h>
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index eb7396b7e7c3..c203ac4df791 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -328,11 +328,13 @@ FTRACE_ENTRY(branch, trace_branch,
__array( char, func, TRACE_FUNC_SIZE+1 )
__array( char, file, TRACE_FILE_SIZE+1 )
__field( char, correct )
+ __field( char, constant )
),
- F_printk("%u:%s:%s (%u)",
+ F_printk("%u:%s:%s (%u)%s",
__entry->line,
- __entry->func, __entry->file, __entry->correct),
+ __entry->func, __entry->file, __entry->correct,
+ __entry->constant ? " CONSTANT" : ""),
FILTER_OTHER
);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f3a960ed75a1..1c21d0e2a145 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -19,6 +19,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
+#include <linux/rculist.h>
#include "tracing_map.h"
#include "trace.h"
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 6721a1e89f39..f2ac9d44f6c4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -22,6 +22,7 @@
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/slab.h>
+#include <linux/rculist.h>
#include "trace.h"
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index af344a1bf0d0..21ea6ae77d93 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -44,6 +44,7 @@
#include <linux/uaccess.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
+#include <linux/sched/clock.h>
#include "trace.h"
static struct trace_array *hwlat_trace;
@@ -266,24 +267,13 @@ out:
static struct cpumask save_cpumask;
static bool disable_migrate;
-static void move_to_next_cpu(bool initmask)
+static void move_to_next_cpu(void)
{
- static struct cpumask *current_mask;
+ struct cpumask *current_mask = &save_cpumask;
int next_cpu;
if (disable_migrate)
return;
-
- /* Just pick the first CPU on first iteration */
- if (initmask) {
- current_mask = &save_cpumask;
- get_online_cpus();
- cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
- put_online_cpus();
- next_cpu = cpumask_first(current_mask);
- goto set_affinity;
- }
-
/*
* If for some reason the user modifies the CPU affinity
* of this thread, than stop migrating for the duration
@@ -300,7 +290,6 @@ static void move_to_next_cpu(bool initmask)
if (next_cpu >= nr_cpu_ids)
next_cpu = cpumask_first(current_mask);
- set_affinity:
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto disable;
@@ -322,20 +311,15 @@ static void move_to_next_cpu(bool initmask)
* need to ensure nothing else might be running (and thus preempting).
* Obviously this should never be used in production environments.
*
- * Currently this runs on which ever CPU it was scheduled on, but most
- * real-world hardware latency situations occur across several CPUs,
- * but we might later generalize this if we find there are any actualy
- * systems with alternate SMI delivery or other hardware latencies.
+ * Executes one loop interaction on each CPU in tracing_cpumask sysfs file.
*/
static int kthread_fn(void *data)
{
u64 interval;
- bool initmask = true;
while (!kthread_should_stop()) {
- move_to_next_cpu(initmask);
- initmask = false;
+ move_to_next_cpu();
local_irq_disable();
get_sample();
@@ -366,13 +350,27 @@ static int kthread_fn(void *data)
*/
static int start_kthread(struct trace_array *tr)
{
+ struct cpumask *current_mask = &save_cpumask;
struct task_struct *kthread;
+ int next_cpu;
+
+ /* Just pick the first CPU on first iteration */
+ current_mask = &save_cpumask;
+ get_online_cpus();
+ cpumask_and(current_mask, cpu_online_mask, tracing_buffer_mask);
+ put_online_cpus();
+ next_cpu = cpumask_first(current_mask);
kthread = kthread_create(kthread_fn, NULL, "hwlatd");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
return -ENOMEM;
}
+
+ cpumask_clear(current_mask);
+ cpumask_set_cpu(next_cpu, current_mask);
+ sched_setaffinity(kthread->pid, current_mask);
+
hwlat_kthread = kthread;
wake_up_process(kthread);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ad9e53ad174..5f688cc724f0 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -16,9 +16,11 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/rculist.h>
#include "trace_probe.h"
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5d33a7352919..02a4aeb22c47 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -8,6 +8,8 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
#include "trace_output.h"
@@ -124,6 +126,44 @@ EXPORT_SYMBOL(trace_print_symbols_seq);
#if BITS_PER_LONG == 32
const char *
+trace_print_flags_seq_u64(struct trace_seq *p, const char *delim,
+ unsigned long long flags,
+ const struct trace_print_flags_u64 *flag_array)
+{
+ unsigned long long mask;
+ const char *str;
+ const char *ret = trace_seq_buffer_ptr(p);
+ int i, first = 1;
+
+ for (i = 0; flag_array[i].name && flags; i++) {
+
+ mask = flag_array[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ str = flag_array[i].name;
+ flags &= ~mask;
+ if (!first && delim)
+ trace_seq_puts(p, delim);
+ else
+ first = 0;
+ trace_seq_puts(p, str);
+ }
+
+ /* check for left over flags */
+ if (flags) {
+ if (!first && delim)
+ trace_seq_puts(p, delim);
+ trace_seq_printf(p, "0x%llx", flags);
+ }
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(trace_print_flags_seq_u64);
+
+const char *
trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
@@ -162,15 +202,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
}
EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
+/**
+ * trace_print_hex_seq - print buffer as hex sequence
+ * @p: trace seq struct to write to
+ * @buf: The buffer to print
+ * @buf_len: Length of @buf in bytes
+ * @concatenate: Print @buf as single hex string or with spacing
+ *
+ * Prints the passed buffer as a hex sequence either as a whole,
+ * single hex string if @concatenate is true or with spacing after
+ * each byte in case @concatenate is false.
+ */
const char *
-trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
+ bool concatenate)
{
int i;
const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; i < buf_len; i++)
- trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
-
+ trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ",
+ buf[i]);
trace_seq_putc(p, 0);
return ret;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8c0553d9afd3..52478f033f88 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -21,6 +21,7 @@
* Copyright (C) IBM Corporation, 2010-2011
* Author: Srikar Dronamraju
*/
+#define pr_fmt(fmt) "trace_probe: " fmt
#include "trace_probe.h"
@@ -647,7 +648,7 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos,
int (*createfn)(int, char **))
{
- char *kbuf, *tmp;
+ char *kbuf, *buf, *tmp;
int ret = 0;
size_t done = 0;
size_t size;
@@ -667,27 +668,38 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
goto out;
}
kbuf[size] = '\0';
- tmp = strchr(kbuf, '\n');
+ buf = kbuf;
+ do {
+ tmp = strchr(buf, '\n');
+ if (tmp) {
+ *tmp = '\0';
+ size = tmp - buf + 1;
+ } else {
+ size = strlen(buf);
+ if (done + size < count) {
+ if (buf != kbuf)
+ break;
+ /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE - 2);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ done += size;
- if (tmp) {
- *tmp = '\0';
- size = tmp - kbuf + 1;
- } else if (done + size < count) {
- pr_warn("Line length is too long: Should be less than %d\n",
- WRITE_BUFSIZE);
- ret = -EINVAL;
- goto out;
- }
- done += size;
- /* Remove comments */
- tmp = strchr(kbuf, '#');
+ /* Remove comments */
+ tmp = strchr(buf, '#');
- if (tmp)
- *tmp = '\0';
+ if (tmp)
+ *tmp = '\0';
- ret = traceprobe_command(kbuf, createfn);
- if (ret)
- goto out;
+ ret = traceprobe_command(buf, createfn);
+ if (ret)
+ goto out;
+ buf += size;
+
+ } while (done < count);
}
ret = done;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 0c0ae54d44c6..903273c93e61 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -248,7 +248,7 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \
#define FETCH_TYPE_STRING 0
#define FETCH_TYPE_STRSIZE 1
-#ifdef CONFIG_KPROBE_EVENT
+#ifdef CONFIG_KPROBE_EVENTS
struct symbol_cache;
unsigned long update_symbol_cache(struct symbol_cache *sc);
void free_symbol_cache(struct symbol_cache *sc);
@@ -278,7 +278,7 @@ alloc_symbol_cache(const char *sym, long offset)
{
return NULL;
}
-#endif /* CONFIG_KPROBE_EVENT */
+#endif /* CONFIG_KPROBE_EVENTS */
struct probe_arg {
struct fetch_param fetch;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea77881..cb917cebae29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
/* Include in trace.c */
+#include <uapi/linux/sched/types.h>
#include <linux/stringify.h>
#include <linux/kthread.h>
#include <linux/delay.h>
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2a1abbaca10e..5fb1f2c87e6b 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -2,6 +2,7 @@
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*
*/
+#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
@@ -64,7 +65,7 @@ void stack_trace_print(void)
}
/*
- * When arch-specific code overides this function, the following
+ * When arch-specific code overrides this function, the following
* data should be filled up, assuming stack_trace_max_lock is held to
* prevent concurrent updates.
* stack_trace_index[]
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 0913693caf6e..a7581fec9681 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,12 +17,14 @@
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
+#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/uprobes.h>
#include <linux/namei.h>
#include <linux/string.h>
+#include <linux/rculist.h>
#include "trace_probe.h"
@@ -431,7 +433,8 @@ static int create_trace_uprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- arg = strchr(argv[1], ':');
+ /* Find the last occurrence, in case the path contains ':' too. */
+ arg = strrchr(argv[1], ':');
if (!arg) {
ret = -EINVAL;
goto fail_address_parse;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1f9a31f934a4..685c50ae6300 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,7 +24,8 @@
#include <linux/tracepoint.h>
#include <linux/err.h>
#include <linux/slab.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
#include <linux/static_key.h>
extern struct tracepoint * const __start___tracepoints_ptrs[];
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5c21f0535056..370724b45391 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -17,7 +17,9 @@
*/
#include <linux/kernel.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/cputime.h>
#include <linux/tsacct_kern.h>
#include <linux/acct.h>
#include <linux/jiffies.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 95c6336fc2b3..b4eeee03934f 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -8,6 +8,7 @@
#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
+#include <linux/cred.h>
#include <linux/hash.h>
#include <linux/user_namespace.h>
@@ -57,7 +58,7 @@ static struct ctl_table_root set_root = {
static int zero = 0;
static int int_max = INT_MAX;
-#define UCOUNT_ENTRY(name) \
+#define UCOUNT_ENTRY(name) \
{ \
.procname = name, \
.maxlen = sizeof(int), \
@@ -74,6 +75,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_net_namespaces"),
UCOUNT_ENTRY("max_mnt_namespaces"),
UCOUNT_ENTRY("max_cgroup_namespaces"),
+#ifdef CONFIG_INOTIFY_USER
+ UCOUNT_ENTRY("max_inotify_instances"),
+ UCOUNT_ENTRY("max_inotify_watches"),
+#endif
{ }
};
#endif /* CONFIG_SYSCTL */
@@ -139,7 +144,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->ns = ns;
new->uid = uid;
- atomic_set(&new->count, 0);
+ new->count = 0;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -150,8 +155,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
ucounts = new;
}
}
- if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+ if (ucounts->count == INT_MAX)
ucounts = NULL;
+ else
+ ucounts->count += 1;
spin_unlock_irq(&ucounts_lock);
return ucounts;
}
@@ -160,13 +167,15 @@ static void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock_irqsave(&ucounts_lock, flags);
+ spin_lock_irqsave(&ucounts_lock, flags);
+ ucounts->count -= 1;
+ if (!ucounts->count)
hlist_del_init(&ucounts->node);
- spin_unlock_irqrestore(&ucounts_lock, flags);
+ else
+ ucounts = NULL;
+ spin_unlock_irqrestore(&ucounts_lock, flags);
- kfree(ucounts);
- }
+ kfree(ucounts);
}
static inline bool atomic_inc_below(atomic_t *v, int u)
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 71645ae9303a..5c2dc5b2bf4f 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/security.h>
+#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
diff --git a/kernel/user.c b/kernel/user.c
index b069ccbfb0b0..00281add65b2 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/key.h>
+#include <linux/sched/user.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 86b7854fec8e..2f735cbe05e8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -8,6 +8,7 @@
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
+#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 6976cd47dcf6..913fe4336d2b 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,8 +14,10 @@
#include <linux/utsname.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
{
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index c8eac43267e9..233cd8fc6910 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -14,6 +14,7 @@
#include <linux/utsname.h>
#include <linux/sysctl.h>
#include <linux/wait.h>
+#include <linux/rwsem.h>
#ifdef CONFIG_PROC_SYSCTL
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 63177be0159e..03e0b69bb5bf 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -19,8 +19,11 @@
#include <linux/sysctl.h>
#include <linux/smpboot.h>
#include <linux/sched/rt.h>
+#include <uapi/linux/sched/types.h>
#include <linux/tick.h>
#include <linux/workqueue.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/debug.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 12b8dd640786..54a427d1f344 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -13,6 +13,8 @@
#include <linux/nmi.h>
#include <linux/module.h>
+#include <linux/sched/debug.h>
+
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
@@ -137,12 +139,14 @@ static void watchdog_overflow_callback(struct perf_event *event,
* Reduce the watchdog noise by only printing messages
* that are different from what cpu0 displayed.
*/
-static unsigned long cpu0_err;
+static unsigned long firstcpu_err;
+static atomic_t watchdog_cpus;
int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
+ int firstcpu = 0;
/* nothing to do if the hard lockup detector is disabled */
if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
@@ -156,19 +160,22 @@ int watchdog_nmi_enable(unsigned int cpu)
if (event != NULL)
goto out_enable;
+ if (atomic_inc_return(&watchdog_cpus) == 1)
+ firstcpu = 1;
+
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
- /* save cpu0 error for future comparision */
- if (cpu == 0 && IS_ERR(event))
- cpu0_err = PTR_ERR(event);
+ /* save the first cpu's error for future comparision */
+ if (firstcpu && IS_ERR(event))
+ firstcpu_err = PTR_ERR(event);
if (!IS_ERR(event)) {
- /* only print for cpu0 or different than cpu0 */
- if (cpu == 0 || cpu0_err)
+ /* only print for the first cpu initialized */
+ if (firstcpu || firstcpu_err)
pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
goto out_save;
}
@@ -186,7 +193,7 @@ int watchdog_nmi_enable(unsigned int cpu)
smp_mb__after_atomic();
/* skip displaying the same error again */
- if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
return PTR_ERR(event);
/* vary the KERN level based on the returned errno */
@@ -222,9 +229,9 @@ void watchdog_nmi_disable(unsigned int cpu)
/* should be in cleanup, but blocks oprofile */
perf_event_release_kernel(event);
- }
- if (cpu == 0) {
+
/* watchdog_nmi_enable() expects this to be zero initially. */
- cpu0_err = 0;
+ if (atomic_dec_and_test(&watchdog_cpus))
+ firstcpu_err = 0;
}
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 072cbc9b175d..c0168b7da1ea 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1507,6 +1507,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ WARN_ON_ONCE(!wq);
WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
WARN_ON_ONCE(timer_pending(timer));