From 61d1b6a42fec61c5065f54cc62cef02b483c69fb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:12 +0200 Subject: bpf, maps: add release callback Add a release callback for maps that is invoked when the last reference to its struct file is gone and the struct file about to be released by vfs. The handler will be used by fd array maps. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 46ecce4b79ed..fc3adcd064b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map) static int bpf_map_release(struct inode *inode, struct file *filp) { - bpf_map_put_with_uref(filp->private_data); + struct bpf_map *map = filp->private_data; + + if (map->ops->map_release) + map->ops->map_release(map, filp); + + bpf_map_put_with_uref(map); return 0; } -- cgit v1.2.3 From d056a788765e67773124f520159185bc89f5d1ad Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:13 +0200 Subject: bpf, maps: extend map_fd_get_ptr arguments This patch extends map_fd_get_ptr() callback that is used by fd array maps, so that struct file pointer from the related map can be passed in. It's safe to remove map_update_elem() callback for the two maps since this is only allowed from syscall side, but not from eBPF programs for these two map types. Like in per-cpu map case, bpf_fd_array_map_update_elem() needs to be called directly here due to the extra argument. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 16 +++++++++------- kernel/bpf/syscall.c | 6 ++++++ 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 76d5a794e426..bfedcbdb4d84 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ -static int fd_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; @@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - new_ptr = map->ops->map_fd_get_ptr(map, ufd); + new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); @@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) } } -static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *prog_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) return prog; @@ -382,6 +384,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + return prog; } @@ -407,7 +410,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = 
fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, @@ -431,7 +433,8 @@ static void perf_event_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct perf_event *event; const struct perf_event_attr *attr; @@ -474,7 +477,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_free = perf_event_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fc3adcd064b1..c23a4e9311b3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -392,6 +392,12 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || + map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); -- cgit v1.2.3 From 3b1efb196eee45b2f0c4994e0c43edb5e367f620 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:14 +0200 Subject: bpf, maps: flush own entries on perf map release The behavior of perf event arrays are quite different from all others as they are tightly coupled to perf event fds, f.e. shown recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array to use struct file") to make refcounting on perf event more robust. A remaining issue that the current code still has is that since additions to the perf event array take a reference on the struct file via perf_event_get() and are only released via fput() (that cleans up the perf event eventually via perf_event_release_kernel()) when the element is either manually removed from the map from user space or automatically when the last reference on the perf event map is dropped. However, this leads us to dangling struct file's when the map gets pinned after the application owning the perf event descriptor exits, and since the struct file reference will in such case only be manually dropped or via pinned file removal, it leads to the perf event living longer than necessary, consuming needlessly resources for that time. Relations between perf event fds and bpf perf event map fds can be rather complex. F.e. maps can act as demuxers among different perf event fds that can possibly be owned by different threads and based on the index selection from the program, events get dispatched to one of the per-cpu fd endpoints. One perf event fd (or, rather a per-cpu set of them) can also live in multiple perf event maps at the same time, listening for events. Also, another requirement is that perf event fds can get closed from application side after they have been attached to the perf event map, so that on exit perf event map will take care of dropping their references eventually. 
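As a rough user-space sketch of the fd relationship described above (illustration only, not part of this patch; error handling is reduced and the helper names are made up), one PERF_COUNT_SW_BPF_OUTPUT event per CPU is opened and stored into a BPF_MAP_TYPE_PERF_EVENT_ARRAY keyed by CPU index:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>
    #include <linux/perf_event.h>

    static int open_bpf_output_event(int cpu)
    {
            struct perf_event_attr attr = {
                    .size        = sizeof(struct perf_event_attr),
                    .type        = PERF_TYPE_SOFTWARE,
                    .config      = PERF_COUNT_SW_BPF_OUTPUT,
                    .sample_type = PERF_SAMPLE_RAW,
            };

            return syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
                           -1 /* group_fd */, 0 /* flags */);
    }

    static int perf_array_update(int map_fd, int cpu, int perf_fd)
    {
            union bpf_attr attr;

            /* unused attr bytes must be zero for the bpf() syscall */
            memset(&attr, 0, sizeof(attr));
            attr.map_fd = map_fd;
            attr.key    = (uint64_t)(unsigned long)&cpu;
            attr.value  = (uint64_t)(unsigned long)&perf_fd;
            attr.flags  = BPF_ANY;

            return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
    }

Once such an update succeeded, the map holds its own struct file reference on the perf event (taken via perf_event_get() at update time), so the application may close its perf fds again; with this patch, releasing the last reference to the map's struct file then purges the entries the map itself still tracks.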
Likewise, when such maps are pinned, the intended behavior is that a user application does bpf_obj_get(), puts its fds in there and on exit when fd is released, they are dropped from the map again, so the map acts rather as connector endpoint. This also makes perf event maps inherently different from program arrays as described in more detail in commit c9da161c6517 ("bpf: fix clearing on persistent program array maps"). To tackle this, map entries are marked by the map struct file that added the element to the map. And when the last reference to that map struct file is released from user space, then the tracked entries are purged from the map. This is okay, because new map struct files instances resp. frontends to the anon inode are provided via bpf_map_new_fd() that is called when we invoke bpf_obj_get_user() for retrieving a pinned map, but also when an initial instance is created via map_create(). The rest is resolved by the vfs layer automatically for us by keeping reference count on the map's struct file. Any concurrent updates on the map slot are fine as well, it just means that perf_event_fd_array_release() needs to delete less of its own entires. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 102 ++++++++++++++++++++++++++++++++++------------- kernel/trace/bpf_trace.c | 18 ++++----- 2 files changed, 82 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bfedcbdb4d84..5af30732697b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -427,59 +427,105 @@ static int __init register_prog_array_map(void) } late_initcall(register_prog_array_map); -static void perf_event_array_map_free(struct bpf_map *map) +static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, + struct file *map_file) { - bpf_fd_array_map_clear(map); - fd_array_map_free(map); + struct bpf_event_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (ee) { + ee->event = perf_file->private_data; + ee->perf_file = perf_file; + ee->map_file = map_file; + } + + return ee; +} + +static void __bpf_event_entry_free(struct rcu_head *rcu) +{ + struct bpf_event_entry *ee; + + ee = container_of(rcu, struct bpf_event_entry, rcu); + fput(ee->perf_file); + kfree(ee); +} + +static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) +{ + call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct perf_event *event; const struct perf_event_attr *attr; - struct file *file; + struct bpf_event_entry *ee; + struct perf_event *event; + struct file *perf_file; - file = perf_event_get(fd); - if (IS_ERR(file)) - return file; + perf_file = perf_event_get(fd); + if (IS_ERR(perf_file)) + return perf_file; - event = file->private_data; + event = perf_file->private_data; + ee = ERR_PTR(-EINVAL); attr = perf_event_attrs(event); - if (IS_ERR(attr)) - goto err; - - if (attr->inherit) - goto err; - - if (attr->type == PERF_TYPE_RAW) - return file; - - if (attr->type == PERF_TYPE_HARDWARE) - return file; + if (IS_ERR(attr) || attr->inherit) + goto err_out; + + switch (attr->type) { + case PERF_TYPE_SOFTWARE: + if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) + goto err_out; + /* fall-through */ + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); + /* fall-through */ + default: + break; + } - if 
(attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return file; -err: - fput(file); - return ERR_PTR(-EINVAL); +err_out: + fput(perf_file); + return ee; } static void perf_event_fd_array_put_ptr(void *ptr) { - fput((struct file *)ptr); + bpf_event_entry_free_rcu(ptr); +} + +static void perf_event_fd_array_release(struct bpf_map *map, + struct file *map_file) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; + int i; + + rcu_read_lock(); + for (i = 0; i < array->map.max_entries; i++) { + ee = READ_ONCE(array->ptrs[i]); + if (ee && ee->map_file == map_file) + fd_array_map_delete_elem(map, &i); + } + rcu_read_unlock(); } static const struct bpf_map_ops perf_event_array_ops = { .map_alloc = fd_array_map_alloc, - .map_free = perf_event_array_map_free, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, + .map_release = perf_event_fd_array_release, }; static struct bpf_map_type_list perf_event_array_type __read_mostly = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..037ea6ea3cb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -192,18 +192,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -233,8 +232,8 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -247,12 +246,11 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit v1.2.3 From 1ca1cc98bf7418c680415bfce05699f67510a7fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:23 +0200 Subject: bpf: minor cleanups on fd maps and helpers Some minor cleanups: i) Remove the unlikely() from fd array map lookups and let the CPU branch predictor do its job, scenarios where there is not always a map entry are very well valid. ii) Move the attribute type check in the bpf_perf_event_read() helper a bit earlier so it's consistent wrt checks with bpf_perf_event_output() helper as well. 
iii) remove some comments that are self-documenting in kprobe_prog_is_valid_access() and therefore make it consistent to tp_prog_is_valid_access() as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +-- kernel/trace/bpf_trace.c | 18 ++++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b94a36550591..d638062f66d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -719,14 +719,13 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); - if (unlikely(!prog)) + if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3de25fbed785..4e61f74a5d73 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -199,19 +199,19 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) return -EINVAL; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -251,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; @@ -354,18 +354,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) return false; - - /* only read is allowed */ if (type != BPF_READ) return false; - - /* disallow misaligned access */ if (off % size != 0) return false; - return true; } -- cgit v1.2.3 From d79313303181d357d293453fb8486bdc87bfd2f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:24 +0200 Subject: bpf, trace: fetch current cpu only once We currently have two invocations, which is unnecessary. Fetch it only once and use the smp_processor_id() variant, so we also get preemption checks along with it when DEBUG_PREEMPT is set. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e61f74a5d73..505f9e9cdb3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,6 +233,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; @@ -246,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) - index = raw_smp_processor_id(); + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -259,7 +260,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; - if (unlikely(event->oncpu != smp_processor_id())) + if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); -- cgit v1.2.3 From 6816a7ffce32e999601825ddfd887f36d3052932 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:25 +0200 Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_read Follow-up commit to 1e33759c788c ("bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output") to add the same functionality into bpf_perf_event_read() helper. The split of index into flags and index component is also safe here, since such large maps are rejected during map allocation time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 505f9e9cdb3b..19c5b4a5c3eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -188,13 +188,19 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -208,8 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -EINVAL; /* make sure event is local and doesn't have pmu::count */ - if (event->oncpu != smp_processor_id() || - event->pmu->count) + if (unlikely(event->oncpu != cpu || event->pmu->count)) return -EINVAL; /* -- cgit v1.2.3 From 80b48c445797a634d869c7e5a53e182ba2688931 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:26 +0200 Subject: bpf: don't use raw processor id in generic helper Use smp_processor_id() for the generic helper bpf_get_smp_processor_id() instead of the raw variant. 
This allows for preemption checks when we have DEBUG_PREEMPT, and otherwise uses the raw variant anyway. We only need to keep the raw variant for socket filters, but we can reuse the helper that is already there from cBPF side. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ad7a0573f71b..1ea3afba1a4f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { - return raw_smp_processor_id(); + return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { -- cgit v1.2.3 From 1aacde3d22c42281236155c1ef6d7a5aa32a826b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:43 +0200 Subject: bpf: generally move prog destruction to RCU deferral Jann Horn reported following analysis that could potentially result in a very hard to trigger (if not impossible) UAF race, to quote his event timeline: - Set up a process with threads T1, T2 and T3 - Let T1 set up a socket filter F1 that invokes another filter F2 through a BPF map [tail call] - Let T1 trigger the socket filter via a unix domain socket write, don't wait for completion - Let T2 call PERF_EVENT_IOC_SET_BPF with F2, don't wait for completion - Now T2 should be behind bpf_prog_get(), but before bpf_prog_put() - Let T3 close the file descriptor for F2, dropping the reference count of F2 to 2 - At this point, T1 should have looked up F2 from the map, but not finished executing it - Let T3 remove F2 from the BPF map, dropping the reference count of F2 to 1 - Now T2 should call bpf_prog_put() (wrong BPF program type), dropping the reference count of F2 to 0 and scheduling bpf_prog_free_deferred() via schedule_work() - At this point, the BPF program could be freed - BPF execution is still running in a freed BPF program While at PERF_EVENT_IOC_SET_BPF time it's only guaranteed that the perf event fd we're doing the syscall on doesn't disappear from underneath us for whole syscall time, it may not be the case for the bpf fd used as an argument only after we did the put. It needs to be a valid fd pointing to a BPF program at the time of the call to make the bpf_prog_get() and while T2 gets preempted, F2 must have dropped reference to 1 on the other CPU. The fput() from the close() in T3 should also add additionally delay to the reference drop via exit_task_work() when bpf_prog_release() gets called as well as scheduling bpf_prog_free_deferred(). That said, it makes nevertheless sense to move the BPF prog destruction generally after RCU grace period to guarantee that such scenario above, but also others as recently fixed in ceb56070359b ("bpf, perf: delay release of BPF prog after grace period") with regards to tail calls won't happen. Integrating bpf_prog_free_deferred() directly into the RCU callback is not allowed since the invocation might happen from either softirq or process context, so we're not permitted to block. Reviewing all bpf_prog_put() invocations from eBPF side (note, cBPF -> eBPF progs don't use this for their destruction) with call_rcu() look good to me. Since we don't know whether at the time of attaching the program, we're already part of a tail call map, we need to use RCU variant. 
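For illustration only (this fragment is not taken from the patch; the function is schematic and bounds checking is omitted), the reader-side pattern that the RCU grace period protects looks roughly like this kernel-side snippet:

    #include <linux/bpf.h>
    #include <linux/filter.h>
    #include <linux/rcupdate.h>

    static unsigned int run_prog_from_array(struct bpf_array *array, u32 index,
                                            void *ctx)
    {
            struct bpf_prog *prog;
            unsigned int ret = 0;

            rcu_read_lock();
            prog = READ_ONCE(array->ptrs[index]);
            if (prog)
                    ret = BPF_PROG_RUN(prog, ctx);
            rcu_read_unlock();
            /*
             * A concurrent, final bpf_prog_put() now merely schedules
             * __bpf_prog_put_rcu(), so the program image stays valid at
             * least until the rcu_read_unlock() above has completed.
             */
            return ret;
    }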
However, due to this, there won't be severely more stress on the RCU callback queue: situations with above bpf_prog_get() and bpf_prog_put() combo in practice normally won't lead to releases, but even if they would, enough effort/ cycles have to be put into loading a BPF program into the kernel already. Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 4 +--- kernel/bpf/syscall.c | 13 +++---------- kernel/events/core.c | 2 +- 3 files changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5af30732697b..4ec57a649b1f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -390,9 +390,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, static void prog_fd_array_put_ptr(void *ptr) { - struct bpf_prog *prog = ptr; - - bpf_prog_put_rcu(prog); + bpf_prog_put(ptr); } /* decrement refcnt of all bpf_progs that are stored in this map */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c23a4e9311b3..f6806a1d7ed9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -623,7 +623,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } -static void __prog_put_common(struct rcu_head *rcu) +static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -632,17 +632,10 @@ static void __prog_put_common(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -/* version of bpf_prog_put() that is called after a grace period */ -void bpf_prog_put_rcu(struct bpf_prog *prog) -{ - if (atomic_dec_and_test(&prog->aux->refcnt)) - call_rcu(&prog->aux->rcu, __prog_put_common); -} - void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) - __prog_put_common(&prog->aux->rcu); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -650,7 +643,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 85cd41878a74..9c51ec3f0f44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7529,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); } } -- cgit v1.2.3 From 113214be7f6c98dd6d0435e4765aea8dea91662c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:44 +0200 Subject: bpf: refactor bpf_prog_get and type check into helper Since bpf_prog_get() and program type check is used in a couple of places, refactor this into a small helper function that we can make use of. Since the non RO prog->aux part is not used in performance critical paths and a program destruction via RCU is rather very unlikley when doing the put, we shouldn't have an issue just doing the bpf_prog_get() + prog->type != type check, but actually not taking the ref at all (due to being in fdget() / fdput() section of the bpf fd) is even cleaner and makes the diff smaller as well, so just go for that. Callsites are changed to make use of the new helper where possible. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f6806a1d7ed9..22863d9872b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -657,7 +657,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *__bpf_prog_get(struct fd f) +static struct bpf_prog *____bpf_prog_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); @@ -678,24 +678,35 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) return prog; } -/* called by sockets/tracing/seccomp before attaching program to an event - * pairs with bpf_prog_put() - */ -struct bpf_prog *bpf_prog_get(u32 ufd) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = __bpf_prog_get(f); + prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; + if (type && prog->type != *type) { + prog = ERR_PTR(-EINVAL); + goto out; + } prog = bpf_prog_inc(prog); +out: fdput(f); - return prog; } -EXPORT_SYMBOL_GPL(bpf_prog_get); + +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + return __bpf_prog_get(ufd, NULL); +} + +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +{ + return __bpf_prog_get(ufd, &type); +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD kern_version -- cgit v1.2.3 From 1f3fe7ebf6136c341012db9f554d4caa566fcbaa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:42 -0700 Subject: cgroup: Add cgroup_get_from_fd Add a helper function to get a cgroup2 from a fd. It will be stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will be introduced in the later patch. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Signed-off-by: David S. Miller --- kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..50787cd61da2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -62,6 +62,7 @@ #include #include #include +#include #include /* @@ -6209,6 +6210,40 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. -- cgit v1.2.3 From 4ed8ec521ed57c4e207ad464ca0388776de74d4b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:43 -0700 Subject: cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations. 
To update an element, the caller is expected to obtain a cgroup2 backed fd by open(cgroup2_dir) and then update the array with that fd. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 2 ++ 3 files changed, 47 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4ec57a649b1f..db1a743e3db2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -537,3 +537,46 @@ static int __init register_perf_event_array_map(void) return 0; } late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int fd) +{ + return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ + /* cgroup_put free cgrp after a rcu grace period */ + cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = cgroup_fd_array_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = cgroup_fd_array_get_ptr, + .map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { + .ops = &cgroup_array_ops, + .type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ + bpf_register_map_type(&cgroup_array_type); + return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 22863d9872b1..96d938a22050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eec9f90ba030..69ba2251a22b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1035,6 +1035,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_get_stackid) goto error; break; + case BPF_MAP_TYPE_CGROUP_ARRAY: + goto error; default: break; } -- cgit v1.2.3 From 4a482f34afcc162d8456f449b137ec2a95be60d8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:44 -0700 Subject: cgroup: bpf: Add bpf_skb_in_cgroup_proto Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk belongs to a descendant of a cgroup2. It is similar to the feature added in netfilter: commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY which will be used by the bpf_skb_in_cgroup. Modifications to the bpf verifier is to ensure BPF_MAP_TYPE_CGROUP_ARRAY and bpf_skb_in_cgroup() are always used together. 
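A minimal user-space sketch of that flow (illustrative only, not from this patch; the cgroup path is a placeholder and error handling is reduced): open the cgroup2 directory, store the fd at a fixed index of the BPF_MAP_TYPE_CGROUP_ARRAY, and let the attached program test skbs against the same index with bpf_skb_in_cgroup():

    #include <fcntl.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int cgroup_array_set(int map_fd, uint32_t index, const char *cg2_path)
    {
            union bpf_attr attr;
            int cg_fd, err;

            cg_fd = open(cg2_path, O_RDONLY | O_DIRECTORY);
            if (cg_fd < 0)
                    return -1;

            memset(&attr, 0, sizeof(attr));
            attr.map_fd = map_fd;
            attr.key    = (uint64_t)(unsigned long)&index;
            attr.value  = (uint64_t)(unsigned long)&cg_fd;
            attr.flags  = BPF_ANY;

            err = syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
            /* the map takes its own cgroup reference, the fd is no longer needed */
            close(cg_fd);
            return err;
    }

For example, cgroup_array_set(map_fd, 0, "/sys/fs/cgroup/unified/foo") would then pair with a program-side bpf_skb_in_cgroup(skb, &map, 0) check (path chosen purely for illustration).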
Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69ba2251a22b..e206c2181412 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1036,7 +1036,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - goto error; + if (func_id != BPF_FUNC_skb_in_cgroup) + goto error; + break; default: break; } @@ -1056,6 +1058,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_skb_in_cgroup: + if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) + goto error; + break; default: break; } -- cgit v1.2.3 From 606274c5abd8e245add01bc7145a8cbb92b69ba8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Jul 2016 22:38:36 -0700 Subject: bpf: introduce bpf_get_current_task() helper over time there were multiple requests to access different data structures and fields of task_struct current, so finally add the helper to access 'current' as-is. Tracing bpf programs will do the rest of walking the pointers via bpf_probe_read(). Note that current can be null and bpf program has to deal it with, but even dumb passing null into bpf_probe_read() is still safe. Suggested-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19c5b4a5c3eb..094c716154ed 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -312,6 +312,17 @@ const struct bpf_func_proto *bpf_get_event_output_proto(void) return &bpf_event_output_proto; } +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return (long) current; +} + +static const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -329,6 +340,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: -- cgit v1.2.3 From a536a6e13ecd0d6eb0ffc36c5d56555896617282 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 11 Jul 2016 12:51:01 -0400 Subject: bpf: make inode code explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config BPF_SYSCALL init/Kconfig: bool "Enable bpf() system call" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modular infrastructure use, so that when reading the driver there is no doubt it is builtin-only. Note that MODULE_ALIAS is a no-op for non-modular code. We replace module.h with init.h since the file does use __init. 
Cc: Alexei Starovoitov Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 318858edb1cd..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -11,7 +11,7 @@ * version 2 as published by the Free Software Foundation. */ -#include +#include #include #include #include @@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = { .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("bpf"); - static int __init bpf_init(void) { int ret; -- cgit v1.2.3 From 7e3f977edd0bd9ea6104156feba95bb5ae9bdd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:03 +0200 Subject: perf, events: add non-linear data support for raw records This patch adds support for non-linear data on raw records. It extends raw records to have one or multiple fragments that will be written linearly into the ring slot, where each fragment can optionally have a custom callback handler to walk and extract complex, possibly non-linear data. If a callback handler is provided for a fragment, then the new __output_custom() will be used instead of __output_copy() for the perf_output_sample() part. perf_prepare_sample() does all the size calculation only once, so perf_output_sample() doesn't need to redo the same work anymore, meaning real_size and padding will be cached in the raw record. The raw record becomes 32 bytes in size without holes; to not increase it further and to avoid doing unnecessary recalculations in fast-path, we can reuse next pointer of the last fragment, idea here is borrowed from ZERO_OR_NULL_PTR(), which should keep the perf_output_sample() path for PERF_SAMPLE_RAW minimal. This facility is needed for BPF's event output helper as a first user that will, in a follow-up, add an additional perf_raw_frag to its perf_raw_record in order to be able to more efficiently dump skb context after a linear head meta data related to it. skbs can be non-linear and thus need a custom output function to dump buffers. Currently, the skb data needs to be copied twice; with the help of __output_custom() this work only needs to be done once. Future users could be things like XDP/BPF programs that work on different context though and would thus also have a different callback function. The few users of raw records are adapted to initialize their frag data from the raw record itself, no change in behavior for them. The code is based upon a PoC diff provided by Peter Zijlstra [1]. [1] http://thread.gmane.org/gmane.linux.network/421294 Suggested-by: Peter Zijlstra Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/events/core.c | 66 +++++++++++++++++++++++++++++++++--------------- kernel/events/internal.h | 16 +++++++++--- kernel/trace/bpf_trace.c | 6 +++-- 3 files changed, 62 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c51ec3f0f44..b1891b6b5c1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; perf_sample_data_init(&data, 0, 0); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..2417eb5512cd 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \ { \ unsigned long size, written; \ \ @@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(memcpy_func) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f 
copy_func, + const void *buf, unsigned long len) +__DEFINE_OUTPUT_COPY_BODY(copy_func) + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 094c716154ed..35ab1b2b041b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct bpf_event_entry *ee; struct perf_event *event; struct perf_raw_record raw = { - .size = size, - .data = data, + .frag = { + .size = size, + .data = data, + }, }; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) -- cgit v1.2.3 From 8e7a3920ac277dd4e690c0e70c9750176e3acb83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:04 +0200 Subject: bpf, perf: split bpf_perf_event_output Split the bpf_perf_event_output() helper as a preparation into two parts. The new bpf_perf_event_output() will prepare the raw record itself and test for unknown flags from BPF trace context, where the __bpf_perf_event_output() does the core work. The latter will be reused later on from bpf_event_output() directly. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 35ab1b2b041b..c35883a9bc11 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,26 +233,17 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_raw_record *raw) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; - struct perf_raw_record raw = { - .frag = { - .size = size, - .data = data, - }, - }; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) @@ -271,11 +262,29 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = &raw; + sample_data.raw = raw; perf_event_output(event, &sample_data, regs); return 0; } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + void *data = (void *)(long) r4; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, -- cgit v1.2.3 From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:05 +0200 Subject: bpf: avoid stack copy and use skb ctx for 
event output This work addresses a couple of issues bpf_skb_event_output() helper currently has: i) We need two copies instead of just a single one for the skb data when it should be part of a sample. The data can be non-linear and thus needs to be extracted via bpf_skb_load_bytes() helper first, and then copied once again into the ring buffer slot. ii) Since bpf_skb_load_bytes() currently needs to be used first, the helper needs to see a constant size on the passed stack buffer to make sure BPF verifier can do sanity checks on it during verification time. Thus, just passing skb->len (or any other non-constant value) wouldn't work, but changing bpf_skb_load_bytes() is also not the proper solution, since the two copies are generally still needed. iii) bpf_skb_load_bytes() is just for rather small buffers like headers, since they need to sit on the limited BPF stack anyway. Instead of working around in bpf_skb_load_bytes(), this work improves the bpf_skb_event_output() helper to address all 3 at once. We can make use of the passed in skb context that we have in the helper anyway, and use some of the reserved flag bits as a length argument. The helper will use the new __output_custom() facility from perf side with bpf_skb_copy() as callback helper to walk and extract the data. It will pass the data for setup to bpf_event_output(), which generates and pushes the raw record with an additional frag part. The linear data used in the first frag of the record serves as programmatically defined meta data passed along with the appended sample. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 6 ++++-- kernel/trace/bpf_trace.c | 33 +++++++++++++++------------------ 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d638062f66d6..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1054,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) return NULL; } -const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - return NULL; + return -ENOTSUPP; } /* Always built-in helper functions. */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c35883a9bc11..ebfbb7dd7033 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -298,29 +298,26 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + .next = ctx_size ? 
&frag : NULL, + .size = meta_size, + .data = meta, + }, + }; perf_fetch_caller_regs(regs); - return bpf_perf_event_output((long)regs, r2, flags, r4, size); -} - -static const struct bpf_func_proto bpf_event_output_proto = { - .func = bpf_event_output, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, -}; - -const struct bpf_func_proto *bpf_get_event_output_proto(void) -{ - return &bpf_event_output_proto; + return __bpf_perf_event_output(regs, map, flags, &raw); } static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -- cgit v1.2.3 From 858d68f10238fdd1ebdd0096f912f063e97c6766 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 16 Jul 2016 01:15:55 +0200 Subject: bpf: bpf_event_entry_gen's alloc needs to be in atomic context Should have been obvious, only called from bpf() syscall via map_update_elem() that calls bpf_fd_array_map_update_elem() under RCU read lock and thus this must also be in GFP_ATOMIC, of course. Fixes: 3b1efb196eee ("bpf, maps: flush own entries on perf map release") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index db1a743e3db2..633a650d7aeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -430,7 +430,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, { struct bpf_event_entry *ee; - ee = kzalloc(sizeof(*ee), GFP_KERNEL); + ee = kzalloc(sizeof(*ee), GFP_ATOMIC); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; -- cgit v1.2.3 From 183fc1537ec39be242dc8b619f71fc11b393d295 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Jul 2016 15:50:58 -0700 Subject: kernel/trace/bpf_trace.c: work around gcc-4.4.4 anon union initialization bug kernel/trace/bpf_trace.c: In function 'bpf_event_output': kernel/trace/bpf_trace.c:312: error: unknown field 'next' specified in initializer kernel/trace/bpf_trace.c:312: warning: missing braces around initializer kernel/trace/bpf_trace.c:312: warning: (near initialization for 'raw.frag.') Fixes: 555c8a8623a3a87 ("bpf: avoid stack copy and use skb ctx for event output") Acked-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: David S. Miller Signed-off-by: Andrew Morton Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebfbb7dd7033..a12bbd32c0a6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -309,7 +309,9 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_raw_record raw = { .frag = { - .next = ctx_size ? &frag : NULL, + { + .next = ctx_size ? &frag : NULL, + }, .size = meta_size, .data = meta, }, -- cgit v1.2.3 From 59d3656d5bf504f771fc44fdbc7a9a8590795f22 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:46 -0700 Subject: bpf: add bpf_prog_add api for bulk prog refcnt A subsystem may need to store many copies of a bpf program, each deserving its own reference. Rather than requiring the caller to loop one by one (with possible mid-loop failure), add a bulk bpf_prog_add api. 
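As a hypothetical driver-side sketch (not part of this patch; the priv/ring structures and function name are invented for illustration), a subsystem holding one program pointer per RX ring would take all references in a single step instead of calling bpf_prog_inc() once per ring:

    #include <linux/atomic.h>
    #include <linux/bpf.h>
    #include <linux/err.h>

    struct my_rx_ring {
            struct bpf_prog *xdp_prog;
            /* ... */
    };

    struct my_priv {
            struct my_rx_ring *rx_ring;
            int num_rx_rings;
            /* ... */
    };

    static int my_xdp_set_prog(struct my_priv *priv, struct bpf_prog *prog)
    {
            struct bpf_prog *old;
            int i;

            /* one reference per RX ring, taken up front in bulk */
            prog = bpf_prog_add(prog, priv->num_rx_rings);
            if (IS_ERR(prog))
                    return PTR_ERR(prog);

            for (i = 0; i < priv->num_rx_rings; i++) {
                    old = xchg(&priv->rx_ring[i].xdp_prog, prog);
                    if (old)
                            bpf_prog_put(old);
            }
            return 0;
    }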
Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 96d938a22050..228f962447a5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&prog->aux->refcnt); + if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_sub(i, &prog->aux->refcnt); return ERR_PTR(-EBUSY); } return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_add); + +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.2.3 From 6a773a15a1e8874e5eccd2f29190c31085912c95 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:47 -0700 Subject: bpf: add XDP prog type for early driver filter Add a new bpf prog type that is intended to run in early stages of the packet rx path. Only minimal packet metadata will be available, hence a new context type, struct xdp_md, is exposed to userspace. So far only expose the packet start and end pointers, and only in read mode. An XDP program must return one of the well known enum values, all other return codes are reserved for future use. Unfortunately, this restriction is hard to enforce at verification time, so take the approach of warning at runtime when such programs are encountered. Out of bounds return codes should alias to XDP_ABORTED. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e206c2181412..a8d67d097b0d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -713,6 +713,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, switch (env->prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: break; default: verbose("verifier is misconfigured\n"); -- cgit v1.2.3 From 4acf6c0b84c91243c705303cd9ff16421914150d Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:56 -0700 Subject: bpf: enable direct packet data write for xdp progs For forwarding to be effective, XDP programs should be allowed to rewrite packet data. This requires that the drivers supporting XDP must all map the packet memory as TODEVICE or BIDIRECTIONAL before invoking the program. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. 
---
 kernel/bpf/verifier.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8d67d097b0d..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,

 #define MAX_PACKET_OFF 0xffff

+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_XDP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 			       int size)
 {
@@ -806,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE) {
+		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into packet\n", value_regno);
+			return -EACCES;
+		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
-- cgit v1.2.3

From aa7145c16d6bf086538ad7eb20c807513bfa5efc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 22 Jul 2016 01:19:42 +0200
Subject: bpf, events: fix offset in skb copy handler

This patch fixes the __output_custom() routine we currently use with
bpf_skb_copy(). I missed that when len is larger than the size of the
current handle, we can issue multiple invocations of copy_func, and
__output_custom() advances destination but also source buffer by the
written amount of bytes. When we have __output_custom(), this is actually
wrong since in that case the source buffer points to a non-linear object,
in our case an skb, which the copy_func helper is supposed to walk.
Therefore, since this is non-linear we thus need to pass the offset into
the helper, so that copy_func can use it for extracting the data from
the source object.

Therefore, adjust the callback signatures properly and pass offset
into the skb_header_pointer() invoked from bpf_skb_copy() callback. The
__DEFINE_OUTPUT_COPY_BODY() is adjusted to accommodate for two things:
i) to pass in whether we should advance source buffer or not; this is
a compile-time constant condition, ii) to pass in the offset for
__output_custom(), which we do with help of __VA_ARGS__, so everything
can stay inlined as is currently. Both changes allow for adapting the
__output_* fast-path helpers w/o extra overhead.

Fixes: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output")
Fixes: 7e3f977edd0b ("perf, events: add non-linear data support for raw records")
Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/events/internal.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2417eb5512cd..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,18 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 	return rb->aux_nr_pages << PAGE_SHIFT;
 }

-#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)	\
 { \
 	unsigned long size, written; \
 \
 	do { \
 		size = min(handle->size, len); \
-		written = memcpy_func(handle->addr, buf, size); \
+		written = memcpy_func(__VA_ARGS__); \
 		written = size - written; \
 \
 		len -= written; \
 		handle->addr += written; \
-		buf += written; \
+		if (advance_buf) \
+			buf += written; \
 		handle->size -= written; \
 		if (!handle->size) { \
 			struct ring_buffer *rb = handle->rb; \
@@ -153,12 +154,16 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 static inline unsigned long \
 func_name(struct perf_output_handle *handle, \
 	  const void *buf, unsigned long len) \
-__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)

 static inline unsigned long
 __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
 		const void *buf, unsigned long len)
-__DEFINE_OUTPUT_COPY_BODY(copy_func)
+{
+	unsigned long orig_len = len;
+	__DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+				  orig_len - len, size)
+}

 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
-- cgit v1.2.3

From 96ae52279594470622ff0585621a13e96b700600 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon
Date: Mon, 25 Jul 2016 05:54:46 -0700
Subject: bpf: Add bpf_probe_write_user BPF helper to be called in tracers

This allows user memory to be written to during the course of a kprobe.
It shouldn't be used to implement any kind of security mechanism
because of TOC-TOU attacks, but rather to debug, divert, and manipulate
execution of semi-cooperative processes.

Although it uses probe_kernel_write, we limit the address space the
probe can write into by checking the space with access_ok. We do this
as opposed to calling copy_to_user directly, in order to avoid sleeping.
In addition we ensure the thread's current fs / segment is USER_DS and
the thread isn't exiting nor a kernel thread.

Given this feature is meant for experiments, and it has a risk of
crashing the system and running programs, we print a warning when a
proglet that attempts to use this helper is installed, along with the
pid and process name.

Signed-off-by: Sargun Dhillon
Cc: Alexei Starovoitov
Cc: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a12bbd32c0a6..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };

+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *src = (void *) (long) r2;
+	int size = (int) r3;
+
+	/*
+	 * Ensure we're in user context which is safe for the helper to
+	 * run. This helper has no business in a kthread.
+	 *
+	 * access_ok() should prevent writing to non-user memory, but in
+	 * some situations (nommu, temporary switch, etc) access_ok() does
+	 * not provide enough validation, hence the check on KERNEL_DS.
+	 */
+	if (unlikely(in_interrupt() ||
+		     current->flags & (PF_KTHREAD | PF_EXITING)))
+		return -EPERM;
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+		return -EPERM;
+	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+		return -EPERM;
+
+	return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+	.func		= bpf_probe_write_user,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+			    current->comm, task_pid_nr(current));
+
+	return &bpf_probe_write_user_proto;
+}
+
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_probe_write_user:
+		return bpf_get_probe_write_proto();
 	default:
 		return NULL;
 	}
-- cgit v1.2.3
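
As context for how the helper is meant to be used (an illustrative sketch in
the style of the samples/bpf programs of the time, not part of the patch; the
SEC() macro, the helper function declarations, the x86_64 PT_REGS_PARM2
definition, and the port numbers are all assumptions here), a kprobe program
could divert a connect() call by patching the caller's sockaddr in place:

#include <uapi/linux/bpf.h>
#include <uapi/linux/in.h>
#include <linux/socket.h>
#include <linux/ptrace.h>

#define SEC(name) __attribute__((section(name), used))
#define PT_REGS_PARM2(x) ((x)->si)	/* x86_64 calling convention */

static int (*bpf_probe_read)(void *dst, int size, void *src) =
	(void *) BPF_FUNC_probe_read;
static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
	(void *) BPF_FUNC_probe_write_user;

/* Redirect connect() calls aimed at port 5555 to port 6666 by rewriting the
 * sockaddr in the (semi-cooperative) process's own memory. __builtin_bswap16
 * stands in for htons() on little-endian hosts.
 */
SEC("kprobe/sys_connect")
int divert_connect(struct pt_regs *ctx)
{
	void *uaddr = (void *) PT_REGS_PARM2(ctx);
	struct sockaddr_in sa = {};

	if (bpf_probe_read(&sa, sizeof(sa), uaddr))
		return 0;

	if (sa.sin_family == AF_INET &&
	    sa.sin_port == __builtin_bswap16(5555)) {
		sa.sin_port = __builtin_bswap16(6666);
		bpf_probe_write_user(uaddr, &sa, sizeof(sa));
	}

	return 0;
}

Loading a program that requests this helper triggers the ratelimited warning
above, tagged with the loader's comm and pid.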