From c4675f935399cbdd3ba3869b0bf6c60528c8111a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Mon, 13 Jul 2015 20:49:32 +0200
Subject: ebpf: remove self-assignment in interpreter's tail call

ARG1 = BPF_R1, as it stands, evaluates to regs[BPF_REG_1] = regs[BPF_REG_1]
and thus has no effect. Remove it and add a comment instead, explaining
what happens here and why the removal is okay: since, from the user-space
side, a tail call is invoked as a pseudo helper function via
bpf_tail_call_proto, the verifier checks the arguments just like with any
other helper function and makes sure that the first argument
(regs[BPF_REG_1]) is of type ARG_PTR_TO_CTX.

Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/core.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..bf38f5e8196c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -453,7 +453,11 @@ select_insn:
         if (unlikely(!prog))
             goto out;

-        ARG1 = BPF_R1;
+        /* ARG1 at this point is guaranteed to point to CTX from
+         * the verifier side due to the fact that the tail call is
+         * handled like a helper, that is, bpf_tail_call_proto,
+         * where arg1_type is ARG_PTR_TO_CTX.
+         */
         insn = prog->insnsi;
         goto select_insn;
 out:
-- cgit v1.2.3

From 4d9c5c53ac99e4cb5d031897863203d7817b36e0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 20 Jul 2015 20:34:19 -0700
Subject: test_bpf: add bpf_skb_vlan_push/pop() tests

Improve the accuracy of timing in test_bpf and add two stress tests:
- {skb->data[0], get_smp_processor_id} repeated 2k times
- {skb->data[0], vlan_push} x 68 followed by {skb->data[0], vlan_pop} x 68

The 1st test is useful to test performance of the JIT implementation of
BPF_LD_ABS together with BPF_CALL instructions.
The 2nd test stresses the skb_vlan_push/pop logic together with skb->data
access via the BPF_LD_ABS insn, and checks that re-caching of skb->data
is done correctly.

In order to call bpf_skb_vlan_push() from test_bpf.ko, we have to add
three EXPORT_SYMBOL_GPL entries.

Signed-off-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bf38f5e8196c..fafa74161445 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
     return 0;
 }
+EXPORT_SYMBOL_GPL(__bpf_call_base);

 /**
  *  __bpf_prog_run - run eBPF program on a given context
-- cgit v1.2.3

From 24b4d2abd0bd628f396dada3e915d395cbf459eb Mon Sep 17 00:00:00 2001
From: Alex Gartrell
Date: Thu, 23 Jul 2015 14:24:40 -0700
Subject: ebpf: Allow dereferences of PTR_TO_STACK registers

Previously, programs like the following were rejected:

    mov %rsp, %r1           ; r1 = rsp
    add $-8, %r1            ; r1 = rsp - 8
    store_q $123, -8(%rsp)  ; *(u64 *)r1 = 123  <- valid
    store_q $123, (%r1)     ; *(u64 *)r1 = 123  <- previously invalid
    mov $0, %r0
    exit                    ; Always need to exit

And we'd get the following error:

    0: (bf) r1 = r10
    1: (07) r1 += -8
    2: (7a) *(u64 *)(r10 -8) = 999
    3: (7a) *(u64 *)(r1 +0) = 999
    R1 invalid mem access 'fp'
    Unable to load program

We already know that such a register is a stack address plus the
appropriate offset, so we should be able to validate those references
as well.

Signed-off-by: Alex Gartrell
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..cd307df98cb3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -648,6 +648,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
     struct verifier_state *state = &env->cur_state;
     int size, err = 0;

+    if (state->regs[regno].type == PTR_TO_STACK)
+        off += state->regs[regno].imm;
+
     size = bpf_size_to_bytes(bpf_size);
     if (size < 0)
         return size;
@@ -667,7 +670,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
         if (!err && t == BPF_READ && value_regno >= 0)
             mark_reg_unknown_value(state->regs, value_regno);

-    } else if (state->regs[regno].type == FRAME_PTR) {
+    } else if (state->regs[regno].type == FRAME_PTR ||
+           state->regs[regno].type == PTR_TO_STACK) {
         if (off >= 0 || off < -MAX_BPF_STACK) {
             verbose("invalid stack off=%d size=%d\n", off, size);
             return -EACCES;
-- cgit v1.2.3
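
For illustration (not part of the patch series): a restricted-C fragment that
a BPF compiler may lower into exactly the pattern above — the address of a
stack slot materialized in an ordinary register (PTR_TO_STACK) and then
dereferenced. The function name and the exact code generation are
assumptions; before this change the verifier only accepted the fp-relative
form.

    #include <linux/types.h>
    #include <linux/ptrace.h>

    int prog(struct pt_regs *ctx)
    {
        u64 val;
        u64 *p = &val;  /* may compile to: r1 = r10; r1 += -8  -> PTR_TO_STACK */

        *p = 123;       /* *(u64 *)(r1 + 0) = 123, accepted after this patch */
        return 0;
    }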
From ffe8690c85b8426db7783064724d106702f1b1e8 Mon Sep 17 00:00:00 2001
From: Kaixu Xia
Date: Thu, 6 Aug 2015 07:02:32 +0000
Subject: perf: add the necessary core perf APIs when accessing event counters in eBPF programs

This patch adds three core perf APIs:
 - perf_event_attrs(): export the struct perf_event_attr from struct
   perf_event;
 - perf_event_get(): get the struct perf_event from the given fd;
 - perf_event_read_local(): read the event counters active on the
   current CPU;

These APIs are needed when accessing event counters in eBPF programs.
The API perf_event_read_local() comes from Peter, so I add the
corresponding SOB.

Signed-off-by: Kaixu Xia
Signed-off-by: Peter Zijlstra
Signed-off-by: David S. Miller
---
 kernel/events/core.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..e2c6a8886d4d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3212,6 +3212,59 @@ static inline u64 perf_event_count(struct perf_event *event)
     return __perf_event_count(event);
 }

+/*
+ * NMI-safe method to read a local event, that is, an event that:
+ * - is either for the current task, or for this CPU
+ * - does not have inherit set, as inherited task events
+ *   will not be local and we cannot read them atomically
+ * - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+    unsigned long flags;
+    u64 val;
+
+    /*
+     * Disabling interrupts avoids all counter scheduling (context
+     * switches, timer based rotation and IPIs).
+     */
+    local_irq_save(flags);
+
+    /* If this is a per-task event, it must be for current */
+    WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+             event->hw.target != current);
+
+    /* If this is a per-CPU event, it must be for this CPU */
+    WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+             event->cpu != smp_processor_id());
+
+    /*
+     * It must not be an event with inherit set, we cannot read
+     * all child counters from atomic context.
+     */
+    WARN_ON_ONCE(event->attr.inherit);
+
+    /*
+     * It must not have a pmu::count method, those are not
+     * NMI safe.
+     */
+    WARN_ON_ONCE(event->pmu->count);
+
+    /*
+     * If the event is currently on this CPU, it's either a per-task
+     * event, or local to this CPU. Furthermore it means it's ACTIVE
+     * (otherwise oncpu == -1).
+     */
+    if (event->oncpu == smp_processor_id())
+        event->pmu->read(event);
+
+    val = local64_read(&event->count);
+    local_irq_restore(flags);
+
+    return val;
+}
+
 static u64 perf_event_read(struct perf_event *event)
 {
     /*
@@ -8574,6 +8627,31 @@ void perf_event_delayed_put(struct task_struct *task)
         WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }

+struct perf_event *perf_event_get(unsigned int fd)
+{
+    int err;
+    struct fd f;
+    struct perf_event *event;
+
+    err = perf_fget_light(fd, &f);
+    if (err)
+        return ERR_PTR(err);
+
+    event = f.file->private_data;
+    atomic_long_inc(&event->refcount);
+    fdput(f);
+
+    return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+    if (!event)
+        return ERR_PTR(-EINVAL);
+
+    return &event->attr;
+}
+
 /*
  * inherit an event from parent task to child task:
  */
-- cgit v1.2.3
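
For illustration (not part of the patch): a minimal sketch of how a
kernel-side consumer might combine the three new APIs. The function name is
hypothetical, error handling is abbreviated, and it assumes the fd refers to
an event local to the current CPU or task — otherwise perf_event_read_local()
trips the WARN_ON_ONCE checks above.

    #include <linux/err.h>
    #include <linux/perf_event.h>

    static u64 read_counter_from_fd(unsigned int fd)
    {
        struct perf_event *event;
        const struct perf_event_attr *attr;
        u64 count;

        event = perf_event_get(fd);             /* takes a reference */
        if (IS_ERR(event))
            return 0;

        attr = perf_event_attrs(event);
        if (IS_ERR(attr) || attr->inherit) {    /* inherited events are not local */
            perf_event_release_kernel(event);
            return 0;
        }

        count = perf_event_read_local(event);   /* NMI-safe read on this CPU */
        perf_event_release_kernel(event);
        return count;
    }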
From 2a36f0b92eb638dd023870574eb471b1c56be9ad Mon Sep 17 00:00:00 2001
From: Wang Nan
Date: Thu, 6 Aug 2015 07:02:33 +0000
Subject: bpf: Make the bpf_prog_array_map more generic

All the map backends are generic in nature. In order to avoid adding much
special-purpose code into the eBPF core, rewrite part of the bpf_prog_array
map code and make it more generic, so that the new perf_event_array map
type can reuse most of it and only needs a few lines of special code.

Signed-off-by: Wang Nan
Signed-off-by: Kaixu Xia
Signed-off-by: David S. Miller
---
 kernel/bpf/arraymap.c | 80 +++++++++++++++++++++++++++++++--------------------
 kernel/bpf/core.c     |  2 +-
 kernel/bpf/syscall.c  |  2 +-
 3 files changed, 51 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..45df6572ecfd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
 }
 late_initcall(register_array_map);

-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
 {
-    /* only bpf_prog file descriptors can be stored in prog_array map */
+    /* only file descriptors can be stored in this type of map */
     if (attr->value_size != sizeof(u32))
         return ERR_PTR(-EINVAL);
     return array_map_alloc(attr);
 }

-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
 {
     struct bpf_array *array = container_of(map, struct bpf_array, map);
     int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)

     /* make sure it's empty */
     for (i = 0; i < array->map.max_entries; i++)
-        BUG_ON(array->prog[i] != NULL);
+        BUG_ON(array->ptrs[i] != NULL);
     kvfree(array);
 }

-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
     return NULL;
 }

 /* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
-                      void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+                    void *value, u64 map_flags)
 {
     struct bpf_array *array = container_of(map, struct bpf_array, map);
-    struct bpf_prog *prog, *old_prog;
+    void *new_ptr, *old_ptr;
     u32 index = *(u32 *)key, ufd;

     if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
         return -E2BIG;

     ufd = *(u32 *)value;
-    prog = bpf_prog_get(ufd);
-    if (IS_ERR(prog))
-        return PTR_ERR(prog);
-
-    if (!bpf_prog_array_compatible(array, prog)) {
-        bpf_prog_put(prog);
-        return -EINVAL;
-    }
+    new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+    if (IS_ERR(new_ptr))
+        return PTR_ERR(new_ptr);

-    old_prog = xchg(array->prog + index, prog);
-    if (old_prog)
-        bpf_prog_put_rcu(old_prog);
+    old_ptr = xchg(array->ptrs + index, new_ptr);
+    if (old_ptr)
+        map->ops->map_fd_put_ptr(old_ptr);

     return 0;
 }

-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
     struct bpf_array *array = container_of(map, struct bpf_array, map);
-    struct bpf_prog *old_prog;
+    void *old_ptr;
     u32 index = *(u32 *)key;

     if (index >= array->map.max_entries)
         return -E2BIG;

-    old_prog = xchg(array->prog + index, NULL);
-    if (old_prog) {
-        bpf_prog_put_rcu(old_prog);
+    old_ptr = xchg(array->ptrs + index, NULL);
+    if (old_ptr) {
+        map->ops->map_fd_put_ptr(old_ptr);
         return 0;
     } else {
         return -ENOENT;
     }
 }

+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+    struct bpf_array *array = container_of(map, struct bpf_array, map);
+    struct bpf_prog *prog = bpf_prog_get(fd);
+    if (IS_ERR(prog))
+        return prog;
+
+    if (!bpf_prog_array_compatible(array, prog)) {
+        bpf_prog_put(prog);
+        return ERR_PTR(-EINVAL);
+    }
+    return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+    struct bpf_prog *prog = ptr;
+
+    bpf_prog_put_rcu(prog);
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
 {
     struct bpf_array *array = container_of(map, struct bpf_array, map);
     int i;

     for (i = 0; i < array->map.max_entries; i++)
-        prog_array_map_delete_elem(map, &i);
+        fd_array_map_delete_elem(map, &i);
 }

 static const struct bpf_map_ops prog_array_ops = {
-    .map_alloc = prog_array_map_alloc,
-    .map_free = prog_array_map_free,
+    .map_alloc = fd_array_map_alloc,
+    .map_free = fd_array_map_free,
     .map_get_next_key = array_map_get_next_key,
-    .map_lookup_elem = prog_array_map_lookup_elem,
-    .map_update_elem = prog_array_map_update_elem,
-    .map_delete_elem = prog_array_map_delete_elem,
+    .map_lookup_elem = fd_array_map_lookup_elem,
+    .map_update_elem = fd_array_map_update_elem,
+    .map_delete_elem = fd_array_map_delete_elem,
+    .map_fd_get_ptr = prog_fd_array_get_ptr,
+    .map_fd_put_ptr = prog_fd_array_put_ptr,
 };

 static struct bpf_map_type_list prog_array_type __read_mostly = {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index fafa74161445..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -450,7 +450,7 @@ select_insn:

         tail_call_cnt++;

-        prog = READ_ONCE(array->prog[index]);
+        prog = READ_ONCE(array->ptrs[index]);
         if (unlikely(!prog))
             goto out;

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..dc9b464fefa9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
     /* prog_array stores refcnt-ed bpf_prog pointers
      * release them all when user space closes prog_array_fd
      */
-    bpf_prog_array_map_clear(map);
+    bpf_fd_array_map_clear(map);

     bpf_map_put(map);
     return 0;
-- cgit v1.2.3
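
The include/linux/bpf.h half of this patch is not shown, since the listing
above is limited to 'kernel'. A sketch of the two new callbacks follows,
reconstructed under the assumption that they mirror their callers in
fd_array_map_update_elem() and fd_array_map_delete_elem():

    struct bpf_map_ops {
        /* funcs callable from userspace (via syscall) */
        struct bpf_map *(*map_alloc)(union bpf_attr *attr);
        void (*map_free)(struct bpf_map *map);
        int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);

        /* funcs callable from userspace and from eBPF programs */
        void *(*map_lookup_elem)(struct bpf_map *map, void *key);
        int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
        int (*map_delete_elem)(struct bpf_map *map, void *key);

        /* funcs used by fd-backed array maps: translate a user-supplied fd
         * into a refcounted kernel object, and release it again */
        void *(*map_fd_get_ptr)(struct bpf_map *map, int fd);
        void (*map_fd_put_ptr)(void *ptr);
    };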
From ea317b267e9d03a8241893aa176fba7661d07579 Mon Sep 17 00:00:00 2001
From: Kaixu Xia
Date: Thu, 6 Aug 2015 07:02:34 +0000
Subject: bpf: Add new bpf map type to store the pointer to struct perf_event

Introduce a new bpf map type 'BPF_MAP_TYPE_PERF_EVENT_ARRAY'. This map
only stores pointers to struct perf_event. The user-space event fds from
the perf_event_open() syscall are converted to pointers to struct
perf_event and stored in the map.

Signed-off-by: Kaixu Xia
Signed-off-by: David S. Miller
---
 kernel/bpf/arraymap.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 45df6572ecfd..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -273,3 +273,60 @@ static int __init register_prog_array_map(void)
     return 0;
 }
 late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+    bpf_fd_array_map_clear(map);
+    fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+    struct perf_event *event;
+    const struct perf_event_attr *attr;
+
+    event = perf_event_get(fd);
+    if (IS_ERR(event))
+        return event;
+
+    attr = perf_event_attrs(event);
+    if (IS_ERR(attr))
+        return (void *)attr;
+
+    if (attr->type != PERF_TYPE_RAW &&
+        attr->type != PERF_TYPE_HARDWARE) {
+        perf_event_release_kernel(event);
+        return ERR_PTR(-EINVAL);
+    }
+    return event;
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+    struct perf_event *event = ptr;
+
+    perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+    .map_alloc = fd_array_map_alloc,
+    .map_free = perf_event_array_map_free,
+    .map_get_next_key = array_map_get_next_key,
+    .map_lookup_elem = fd_array_map_lookup_elem,
+    .map_update_elem = fd_array_map_update_elem,
+    .map_delete_elem = fd_array_map_delete_elem,
+    .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+    .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+    .ops = &perf_event_array_ops,
+    .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+    bpf_register_map_type(&perf_event_array_type);
+    return 0;
+}
+late_initcall(register_perf_event_array_map);
-- cgit v1.2.3
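
For illustration (not from the patch): a user-space sketch that creates a
hardware cycle counter for CPU 0 with perf_event_open() and stores its fd
at index 0 of a BPF_MAP_TYPE_PERF_EVENT_ARRAY; on update, the kernel
converts the fd via perf_event_fd_array_get_ptr() above. The bpf() wrapper
is local to this example and error handling is elided.

    #include <linux/bpf.h>
    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int bpf(int cmd, union bpf_attr *attr)
    {
        return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
    }

    int main(void)
    {
        union bpf_attr create = {
            .map_type    = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
            .key_size    = sizeof(uint32_t),
            .value_size  = sizeof(uint32_t),  /* values are perf event fds */
            .max_entries = 1,
        };
        struct perf_event_attr pattr = {
            .type   = PERF_TYPE_HARDWARE,     /* only RAW/HARDWARE pass the check */
            .size   = sizeof(pattr),
            .config = PERF_COUNT_HW_CPU_CYCLES,
        };
        union bpf_attr update;
        uint32_t key = 0, value;
        int map_fd, perf_fd;

        map_fd  = bpf(BPF_MAP_CREATE, &create);
        perf_fd = syscall(__NR_perf_event_open, &pattr,
                  -1 /* any task */, 0 /* CPU 0 */, -1, 0);

        value = perf_fd;
        memset(&update, 0, sizeof(update));
        update.map_fd = map_fd;
        update.key    = (uintptr_t)&key;
        update.value  = (uintptr_t)&value;
        update.flags  = BPF_ANY;

        return bpf(BPF_MAP_UPDATE_ELEM, &update);
    }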
From 35578d7984003097af2b1e34502bc943d40c1804 Mon Sep 17 00:00:00 2001
From: Kaixu Xia
Date: Thu, 6 Aug 2015 07:02:35 +0000
Subject: bpf: Implement function bpf_perf_event_read() that gets the selected hardware PMU counter

Given the perf_event map fd and an index, the function
bpf_perf_event_read() converts the corresponding map value to a pointer
to struct perf_event and returns the hardware PMU counter value.

Signed-off-by: Kaixu Xia
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c    | 48 +++++++++++++++++++++++++++++++++---------------
 kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cd307df98cb3..48e1c7192560 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
     [CONST_IMM]     = "imm",
 };

+static const struct {
+    int map_type;
+    int func_id;
+} func_limit[] = {
+    {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
+    {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+};
+
 static void print_verifier_state(struct verifier_env *env)
 {
     enum bpf_reg_type t;
@@ -837,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
     return err;
 }

+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+    bool bool_map, bool_func;
+    int i;
+
+    if (!map)
+        return 0;
+
+    for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+        bool_map = (map->map_type == func_limit[i].map_type);
+        bool_func = (func_id == func_limit[i].func_id);
+        /* only when the map and func pair match can it continue;
+         * don't allow any other map type to be passed into
+         * the special func
+         */
+        if (bool_map != bool_func)
+            return -EINVAL;
+    }
+
+    return 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
     struct verifier_state *state = &env->cur_state;
@@ -912,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
         return -EINVAL;
     }

-    if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
-        func_id != BPF_FUNC_tail_call)
-        /* prog_array map type needs extra care:
-         * only allow to pass it into bpf_tail_call() for now.
-         * bpf_map_delete_elem() can be allowed in the future,
-         * while bpf_map_update_elem() must only be done via syscall
-         */
-        return -EINVAL;
-
-    if (func_id == BPF_FUNC_tail_call &&
-        map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
-        /* don't allow any other map type to be passed into
-         * bpf_tail_call()
-         */
-        return -EINVAL;
+    err = check_map_func_compatibility(map, func_id);
+    if (err)
+        return err;

     return 0;
 }
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..ef9936df1b04 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -158,6 +158,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
     return &bpf_trace_printk_proto;
 }

+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+    struct bpf_array *array = container_of(map, struct bpf_array, map);
+    struct perf_event *event;
+
+    if (unlikely(index >= array->map.max_entries))
+        return -E2BIG;
+
+    event = (struct perf_event *)array->ptrs[index];
+    if (!event)
+        return -ENOENT;
+
+    /*
+     * The return value alone does not tell whether the read was
+     * successful; that has to be judged elsewhere, e.g. by the
+     * eBPF program itself.
+     */
+    return perf_event_read_local(event);
+}
+
+const struct bpf_func_proto bpf_perf_event_read_proto = {
+    .func       = bpf_perf_event_read,
+    .gpl_only   = false,
+    .ret_type   = RET_INTEGER,
+    .arg1_type  = ARG_CONST_MAP_PTR,
+    .arg2_type  = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
     switch (func_id) {
@@ -183,6 +212,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
         return bpf_get_trace_printk_proto();
     case BPF_FUNC_get_smp_processor_id:
         return &bpf_get_smp_processor_id_proto;
+    case BPF_FUNC_perf_event_read:
+        return &bpf_perf_event_read_proto;
     default:
         return NULL;
     }
-- cgit v1.2.3
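
For illustration (not from the patch): an eBPF kprobe program using the new
helper, written in the style of the kernel's samples/bpf. SEC(), struct
bpf_map_def and the helper stubs are assumed to come from the samples'
bpf_helpers.h, and the loader is assumed to have stored one perf event fd
per CPU in the map, as sketched in the previous example.

    #include <linux/ptrace.h>
    #include <uapi/linux/bpf.h>
    #include "bpf_helpers.h"

    struct bpf_map_def SEC("maps") my_map = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(u32),
        .max_entries = 32,
    };

    SEC("kprobe/sys_write")
    int bpf_prog(struct pt_regs *ctx)
    {
        u32 key = bpf_get_smp_processor_id();  /* entry must hold this CPU's event */
        char fmt[] = "CPU-%d  %llu\n";
        u64 count;

        count = bpf_perf_event_read(&my_map, key);
        /* very large unsigned values correspond to -E2BIG/-ENOENT above */
        bpf_trace_printk(fmt, sizeof(fmt), key, count);
        return 0;
    }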
From 140d8b335a9beb234fd0ed9a15aa6a47f47fd771 Mon Sep 17 00:00:00 2001
From: Wei-Chun Chao
Date: Wed, 12 Aug 2015 07:57:12 -0700
Subject: bpf: fix bpf_perf_event_read() loop upper bound

The verifier rejects valid programs: the loop upper bound in
check_map_func_compatibility() walks one entry past the end of
func_limit[].

Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read()")
Cc: Kaixu Xia
Cc: Alexei Starovoitov
Signed-off-by: Wei-Chun Chao
Acked-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48e1c7192560..ed12e385fb75 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -853,7 +853,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
     if (!map)
         return 0;

-    for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
+    for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
         bool_map = (map->map_type == func_limit[i].map_type);
         bool_func = (func_id == func_limit[i].func_id);
         /* only when the map and func pair match can it continue;
-- cgit v1.2.3
From 1a6877b9c0c2ad901d4335d909432d3bb6d3a330 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Fri, 28 Aug 2015 15:56:22 -0700
Subject: lib: introduce strncpy_from_unsafe()

Generalize FETCH_FUNC_NAME(memory, string) into strncpy_from_unsafe() and
fix sparse warnings that were present in the original implementation.

Signed-off-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/trace/trace_kprobe.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
 static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr,
                         void *dest)
 {
-    long ret;
     int maxlen = get_rloc_len(*(u32 *)dest);
     u8 *dst = get_rloc_data(dest);
-    u8 *src = addr;
-    mm_segment_t old_fs = get_fs();
+    long ret;

     if (!maxlen)
         return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, void *addr,
      * Try to get string again, since the string can be changed while
      * probing.
      */
-    set_fs(KERNEL_DS);
-    pagefault_disable();
-
-    do
-        ret = __copy_from_user_inatomic(dst++, src++, 1);
-    while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
-    dst[-1] = '\0';
-    pagefault_enable();
-    set_fs(old_fs);
+    ret = strncpy_from_unsafe(dst, addr, maxlen);

     if (ret < 0) {  /* Failed to fetch string */
-        ((u8 *)get_rloc_data(dest))[0] = '\0';
+        dst[0] = '\0';
         *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
     } else {
-        *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
-                          get_rloc_offs(*(u32 *)dest));
+        *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
     }
 }
 NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
-- cgit v1.2.3
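
For illustration (not from the patch): strncpy_from_unsafe() copies a
NUL-terminated string from an address that may fault, without the
set_fs()/pagefault_disable() boilerplate it replaces here. The caller below
is hypothetical; the return contract matches the usage above — a value >= 0
is the number of bytes copied, a negative value means the access faulted.

    #include <linux/printk.h>
    #include <linux/uaccess.h>

    static void dump_name(const char *unsafe_ptr)
    {
        char buf[64];
        long ret;

        ret = strncpy_from_unsafe(buf, unsafe_ptr, sizeof(buf));
        if (ret < 0)
            buf[0] = '\0';  /* faulted: report an empty string */
        pr_info("name: %s\n", buf);
    }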
From 8d3b7dce8622919da5c5822ef7338d6604c9fe6e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Fri, 28 Aug 2015 15:56:23 -0700
Subject: bpf: add support for %s specifier to bpf_trace_printk()

The %s specifier makes bpf program and kernel debugging easier. To make
sure that trace_printk won't crash, the unsafe string is copied onto the
stack and the unsafe pointer is substituted.

The following C program (the include is reconstructed here; the original
listing lost it, and struct filename comes from linux/fs.h):

 #include <linux/fs.h>

 int foo(struct pt_regs *ctx, struct filename *filename)
 {
     void *name = 0;

     bpf_probe_read(&name, sizeof(name), &filename->name);
     bpf_trace_printk("executed %s\n", name);
     return 0;
 }

when attached to kprobe do_execve() will produce output in
/sys/kernel/debug/tracing/trace_pipe:

    make-13492     [002] d..1  3250.997277: : executed /bin/sh
    sh-13493       [004] d..1  3250.998716: : executed /usr/bin/gcc
    gcc-13494      [002] d..1  3250.999822: : executed /usr/lib/gcc/x86_64-linux-gnu/4.7/cc1
    gcc-13495      [002] d..1  3251.006731: : executed /usr/bin/as
    gcc-13496      [002] d..1  3251.011831: : executed /usr/lib/gcc/x86_64-linux-gnu/4.7/collect2
    collect2-13497 [000] d..1  3251.012941: : executed /usr/bin/ld

Suggested-by: Brendan Gregg
Signed-off-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/trace/bpf_trace.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ef9936df1b04..0fe96c7c8803 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {

 /*
  * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
  */
 static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
 {
     char *fmt = (char *) (long) r1;
+    bool str_seen = false;
     int mod[3] = {};
     int fmt_cnt = 0;
+    u64 unsafe_addr;
+    char buf[64];
     int i;

     /*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
         if (fmt[i] == 'l') {
             mod[fmt_cnt]++;
             i++;
-        } else if (fmt[i] == 'p') {
+        } else if (fmt[i] == 'p' || fmt[i] == 's') {
             mod[fmt_cnt]++;
             i++;
             if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
                 return -EINVAL;
             fmt_cnt++;
+            if (fmt[i - 1] == 's') {
+                if (str_seen)
+                    /* allow only one '%s' per fmt string */
+                    return -EINVAL;
+                str_seen = true;
+
+                switch (fmt_cnt) {
+                case 1:
+                    unsafe_addr = r3;
+                    r3 = (long) buf;
+                    break;
+                case 2:
+                    unsafe_addr = r4;
+                    r4 = (long) buf;
+                    break;
+                case 3:
+                    unsafe_addr = r5;
+                    r5 = (long) buf;
+                    break;
+                }
+                buf[0] = 0;
+                strncpy_from_unsafe(buf,
+                            (void *) (long) unsafe_addr,
+                            sizeof(buf));
+            }
             continue;
         }
-- cgit v1.2.3
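
For illustration (not from the commit): two practical limits follow from the
implementation above — the unsafe string is copied into a 64-byte on-stack
buffer, so longer strings are truncated, and only a single %s per format
string is accepted. The fragment below is a hypothetical variation of the
commit's example; the explicit fmt_size argument matches the kernel helper's
signature.

    #include <linux/fs.h>

    int foo(struct pt_regs *ctx, struct filename *filename)
    {
        void *name = 0;
        char ok[]  = "executed %s\n";  /* accepted: a single %s */
        char bad[] = "%s vs %s\n";     /* rejected at run time: -EINVAL */

        bpf_probe_read(&name, sizeof(name), &filename->name);
        bpf_trace_printk(ok, sizeof(ok), name);  /* prints at most 63 chars */
        bpf_trace_printk(bad, sizeof(bad), name, name);
        return 0;
    }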