From 61d1b6a42fec61c5065f54cc62cef02b483c69fb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:12 +0200 Subject: bpf, maps: add release callback Add a release callback for maps that is invoked when the last reference to its struct file is gone and the struct file about to be released by vfs. The handler will be used by fd array maps. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 46ecce4b79ed..fc3adcd064b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -124,7 +124,12 @@ void bpf_map_put_with_uref(struct bpf_map *map) static int bpf_map_release(struct inode *inode, struct file *filp) { - bpf_map_put_with_uref(filp->private_data); + struct bpf_map *map = filp->private_data; + + if (map->ops->map_release) + map->ops->map_release(map, filp); + + bpf_map_put_with_uref(map); return 0; } -- cgit v1.2.3 From d056a788765e67773124f520159185bc89f5d1ad Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:13 +0200 Subject: bpf, maps: extend map_fd_get_ptr arguments This patch extends map_fd_get_ptr() callback that is used by fd array maps, so that struct file pointer from the related map can be passed in. It's safe to remove map_update_elem() callback for the two maps since this is only allowed from syscall side, but not from eBPF programs for these two map types. Like in per-cpu map case, bpf_fd_array_map_update_elem() needs to be called directly here due to the extra argument. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 16 +++++++++------- kernel/bpf/syscall.c | 6 ++++++ 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 76d5a794e426..bfedcbdb4d84 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -328,8 +328,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) } /* only called from syscall */ -static int fd_array_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *new_ptr, *old_ptr; @@ -342,7 +342,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key, return -E2BIG; ufd = *(u32 *)value; - new_ptr = map->ops->map_fd_get_ptr(map, ufd); + new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); if (IS_ERR(new_ptr)) return PTR_ERR(new_ptr); @@ -371,10 +371,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) } } -static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *prog_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); + if (IS_ERR(prog)) return prog; @@ -382,6 +384,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) bpf_prog_put(prog); return ERR_PTR(-EINVAL); } + return prog; } @@ -407,7 +410,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = 
fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, @@ -431,7 +433,8 @@ static void perf_event_array_map_free(struct bpf_map *map) fd_array_map_free(map); } -static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +static void *perf_event_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file, int fd) { struct perf_event *event; const struct perf_event_attr *attr; @@ -474,7 +477,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_free = perf_event_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, - .map_update_elem = fd_array_map_update_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fc3adcd064b1..c23a4e9311b3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -392,6 +392,12 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || + map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + rcu_read_lock(); + err = bpf_fd_array_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); -- cgit v1.2.3 From 3b1efb196eee45b2f0c4994e0c43edb5e367f620 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jun 2016 22:47:14 +0200 Subject: bpf, maps: flush own entries on perf map release The behavior of perf event arrays are quite different from all others as they are tightly coupled to perf event fds, f.e. shown recently by commit e03e7ee34fdd ("perf/bpf: Convert perf_event_array to use struct file") to make refcounting on perf event more robust. A remaining issue that the current code still has is that since additions to the perf event array take a reference on the struct file via perf_event_get() and are only released via fput() (that cleans up the perf event eventually via perf_event_release_kernel()) when the element is either manually removed from the map from user space or automatically when the last reference on the perf event map is dropped. However, this leads us to dangling struct file's when the map gets pinned after the application owning the perf event descriptor exits, and since the struct file reference will in such case only be manually dropped or via pinned file removal, it leads to the perf event living longer than necessary, consuming needlessly resources for that time. Relations between perf event fds and bpf perf event map fds can be rather complex. F.e. maps can act as demuxers among different perf event fds that can possibly be owned by different threads and based on the index selection from the program, events get dispatched to one of the per-cpu fd endpoints. One perf event fd (or, rather a per-cpu set of them) can also live in multiple perf event maps at the same time, listening for events. Also, another requirement is that perf event fds can get closed from application side after they have been attached to the perf event map, so that on exit perf event map will take care of dropping their references eventually. 
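As a rough user-space sketch of the fd relationship described above (illustration only, not part of this patch; error handling is reduced and the helper names are made up), one PERF_COUNT_SW_BPF_OUTPUT event per CPU is opened and stored into a BPF_MAP_TYPE_PERF_EVENT_ARRAY keyed by CPU index:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>
    #include <linux/perf_event.h>

    static int open_bpf_output_event(int cpu)
    {
            struct perf_event_attr attr = {
                    .size        = sizeof(struct perf_event_attr),
                    .type        = PERF_TYPE_SOFTWARE,
                    .config      = PERF_COUNT_SW_BPF_OUTPUT,
                    .sample_type = PERF_SAMPLE_RAW,
            };

            return syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
                           -1 /* group_fd */, 0 /* flags */);
    }

    static int perf_array_update(int map_fd, int cpu, int perf_fd)
    {
            union bpf_attr attr;

            /* unused attr bytes must be zero for the bpf() syscall */
            memset(&attr, 0, sizeof(attr));
            attr.map_fd = map_fd;
            attr.key    = (uint64_t)(unsigned long)&cpu;
            attr.value  = (uint64_t)(unsigned long)&perf_fd;
            attr.flags  = BPF_ANY;

            return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
    }

Once such an update succeeded, the map holds its own struct file reference on the perf event (taken via perf_event_get() at update time), so the application may close its perf fds again; with this patch, releasing the last reference to the map's struct file then purges the entries the map itself still tracks.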
Likewise, when such maps are pinned, the intended behavior is that a user application does bpf_obj_get(), puts its fds in there and on exit when fd is released, they are dropped from the map again, so the map acts rather as connector endpoint. This also makes perf event maps inherently different from program arrays as described in more detail in commit c9da161c6517 ("bpf: fix clearing on persistent program array maps"). To tackle this, map entries are marked by the map struct file that added the element to the map. And when the last reference to that map struct file is released from user space, then the tracked entries are purged from the map. This is okay, because new map struct files instances resp. frontends to the anon inode are provided via bpf_map_new_fd() that is called when we invoke bpf_obj_get_user() for retrieving a pinned map, but also when an initial instance is created via map_create(). The rest is resolved by the vfs layer automatically for us by keeping reference count on the map's struct file. Any concurrent updates on the map slot are fine as well, it just means that perf_event_fd_array_release() needs to delete less of its own entires. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 102 ++++++++++++++++++++++++++++++++++------------- kernel/trace/bpf_trace.c | 18 ++++----- 2 files changed, 82 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bfedcbdb4d84..5af30732697b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -427,59 +427,105 @@ static int __init register_prog_array_map(void) } late_initcall(register_prog_array_map); -static void perf_event_array_map_free(struct bpf_map *map) +static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, + struct file *map_file) { - bpf_fd_array_map_clear(map); - fd_array_map_free(map); + struct bpf_event_entry *ee; + + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (ee) { + ee->event = perf_file->private_data; + ee->perf_file = perf_file; + ee->map_file = map_file; + } + + return ee; +} + +static void __bpf_event_entry_free(struct rcu_head *rcu) +{ + struct bpf_event_entry *ee; + + ee = container_of(rcu, struct bpf_event_entry, rcu); + fput(ee->perf_file); + kfree(ee); +} + +static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) +{ + call_rcu(&ee->rcu, __bpf_event_entry_free); } static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct perf_event *event; const struct perf_event_attr *attr; - struct file *file; + struct bpf_event_entry *ee; + struct perf_event *event; + struct file *perf_file; - file = perf_event_get(fd); - if (IS_ERR(file)) - return file; + perf_file = perf_event_get(fd); + if (IS_ERR(perf_file)) + return perf_file; - event = file->private_data; + event = perf_file->private_data; + ee = ERR_PTR(-EINVAL); attr = perf_event_attrs(event); - if (IS_ERR(attr)) - goto err; - - if (attr->inherit) - goto err; - - if (attr->type == PERF_TYPE_RAW) - return file; - - if (attr->type == PERF_TYPE_HARDWARE) - return file; + if (IS_ERR(attr) || attr->inherit) + goto err_out; + + switch (attr->type) { + case PERF_TYPE_SOFTWARE: + if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) + goto err_out; + /* fall-through */ + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); + /* fall-through */ + default: + break; + } - if 
(attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return file; -err: - fput(file); - return ERR_PTR(-EINVAL); +err_out: + fput(perf_file); + return ee; } static void perf_event_fd_array_put_ptr(void *ptr) { - fput((struct file *)ptr); + bpf_event_entry_free_rcu(ptr); +} + +static void perf_event_fd_array_release(struct bpf_map *map, + struct file *map_file) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; + int i; + + rcu_read_lock(); + for (i = 0; i < array->map.max_entries; i++) { + ee = READ_ONCE(array->ptrs[i]); + if (ee && ee->map_file == map_file) + fd_array_map_delete_elem(map, &i); + } + rcu_read_unlock(); } static const struct bpf_map_ops perf_event_array_ops = { .map_alloc = fd_array_map_alloc, - .map_free = perf_event_array_map_free, + .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = perf_event_fd_array_get_ptr, .map_fd_put_ptr = perf_event_fd_array_put_ptr, + .map_release = perf_event_fd_array_release, }; static struct bpf_map_type_list perf_event_array_type __read_mostly = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..037ea6ea3cb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -192,18 +192,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -233,8 +232,8 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; + struct bpf_event_entry *ee; struct perf_event *event; - struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -247,12 +246,11 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = READ_ONCE(array->ptrs[index]); - if (unlikely(!file)) + ee = READ_ONCE(array->ptrs[index]); + if (unlikely(!ee)) return -ENOENT; - event = file->private_data; - + event = ee->event; if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; -- cgit v1.2.3 From 1ca1cc98bf7418c680415bfce05699f67510a7fd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:23 +0200 Subject: bpf: minor cleanups on fd maps and helpers Some minor cleanups: i) Remove the unlikely() from fd array map lookups and let the CPU branch predictor do its job, scenarios where there is not always a map entry are very well valid. ii) Move the attribute type check in the bpf_perf_event_read() helper a bit earlier so it's consistent wrt checks with bpf_perf_event_output() helper as well. 
iii) remove some comments that are self-documenting in kprobe_prog_is_valid_access() and therefore make it consistent to tp_prog_is_valid_access() as well. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +-- kernel/trace/bpf_trace.c | 18 ++++++------------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b94a36550591..d638062f66d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -719,14 +719,13 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); - if (unlikely(!prog)) + if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3de25fbed785..4e61f74a5d73 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -199,19 +199,19 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) return -EINVAL; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as @@ -251,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -E2BIG; ee = READ_ONCE(array->ptrs[index]); - if (unlikely(!ee)) + if (!ee) return -ENOENT; event = ee->event; @@ -354,18 +354,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) return false; - - /* only read is allowed */ if (type != BPF_READ) return false; - - /* disallow misaligned access */ if (off % size != 0) return false; - return true; } -- cgit v1.2.3 From d79313303181d357d293453fb8486bdc87bfd2f4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:24 +0200 Subject: bpf, trace: fetch current cpu only once We currently have two invocations, which is unnecessary. Fetch it only once and use the smp_processor_id() variant, so we also get preemption checks along with it when DEBUG_PREEMPT is set. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/trace/bpf_trace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4e61f74a5d73..505f9e9cdb3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,6 +233,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct pt_regs *regs = (struct pt_regs *) (long) r1; struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; void *data = (void *) (long) r4; struct perf_sample_data sample_data; @@ -246,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; if (index == BPF_F_CURRENT_CPU) - index = raw_smp_processor_id(); + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -259,7 +260,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; - if (unlikely(event->oncpu != smp_processor_id())) + if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); -- cgit v1.2.3 From 6816a7ffce32e999601825ddfd887f36d3052932 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:25 +0200 Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_read Follow-up commit to 1e33759c788c ("bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output") to add the same functionality into bpf_perf_event_read() helper. The split of index into flags and index component is also safe here, since such large maps are rejected during map allocation time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 505f9e9cdb3b..19c5b4a5c3eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -188,13 +188,19 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) +static u64 bpf_perf_event_read(u64 r1, u64 flags, u64 r3, u64 r4, u64 r5) { struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; struct perf_event *event; + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; if (unlikely(index >= array->map.max_entries)) return -E2BIG; @@ -208,8 +214,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) return -EINVAL; /* make sure event is local and doesn't have pmu::count */ - if (event->oncpu != smp_processor_id() || - event->pmu->count) + if (unlikely(event->oncpu != cpu || event->pmu->count)) return -EINVAL; /* -- cgit v1.2.3 From 80b48c445797a634d869c7e5a53e182ba2688931 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 28 Jun 2016 12:18:26 +0200 Subject: bpf: don't use raw processor id in generic helper Use smp_processor_id() for the generic helper bpf_get_smp_processor_id() instead of the raw variant. 
This allows for preemption checks when we have DEBUG_PREEMPT, and otherwise uses the raw variant anyway. We only need to keep the raw variant for socket filters, but we can reuse the helper that is already there from cBPF side. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ad7a0573f71b..1ea3afba1a4f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -101,7 +101,7 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { - return raw_smp_processor_id(); + return smp_processor_id(); } const struct bpf_func_proto bpf_get_smp_processor_id_proto = { -- cgit v1.2.3 From 1aacde3d22c42281236155c1ef6d7a5aa32a826b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:43 +0200 Subject: bpf: generally move prog destruction to RCU deferral Jann Horn reported following analysis that could potentially result in a very hard to trigger (if not impossible) UAF race, to quote his event timeline: - Set up a process with threads T1, T2 and T3 - Let T1 set up a socket filter F1 that invokes another filter F2 through a BPF map [tail call] - Let T1 trigger the socket filter via a unix domain socket write, don't wait for completion - Let T2 call PERF_EVENT_IOC_SET_BPF with F2, don't wait for completion - Now T2 should be behind bpf_prog_get(), but before bpf_prog_put() - Let T3 close the file descriptor for F2, dropping the reference count of F2 to 2 - At this point, T1 should have looked up F2 from the map, but not finished executing it - Let T3 remove F2 from the BPF map, dropping the reference count of F2 to 1 - Now T2 should call bpf_prog_put() (wrong BPF program type), dropping the reference count of F2 to 0 and scheduling bpf_prog_free_deferred() via schedule_work() - At this point, the BPF program could be freed - BPF execution is still running in a freed BPF program While at PERF_EVENT_IOC_SET_BPF time it's only guaranteed that the perf event fd we're doing the syscall on doesn't disappear from underneath us for whole syscall time, it may not be the case for the bpf fd used as an argument only after we did the put. It needs to be a valid fd pointing to a BPF program at the time of the call to make the bpf_prog_get() and while T2 gets preempted, F2 must have dropped reference to 1 on the other CPU. The fput() from the close() in T3 should also add additionally delay to the reference drop via exit_task_work() when bpf_prog_release() gets called as well as scheduling bpf_prog_free_deferred(). That said, it makes nevertheless sense to move the BPF prog destruction generally after RCU grace period to guarantee that such scenario above, but also others as recently fixed in ceb56070359b ("bpf, perf: delay release of BPF prog after grace period") with regards to tail calls won't happen. Integrating bpf_prog_free_deferred() directly into the RCU callback is not allowed since the invocation might happen from either softirq or process context, so we're not permitted to block. Reviewing all bpf_prog_put() invocations from eBPF side (note, cBPF -> eBPF progs don't use this for their destruction) with call_rcu() look good to me. Since we don't know whether at the time of attaching the program, we're already part of a tail call map, we need to use RCU variant. 
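For illustration only (this fragment is not taken from the patch; the function is schematic and bounds checking is omitted), the reader-side pattern that the RCU grace period protects looks roughly like this kernel-side snippet:

    #include <linux/bpf.h>
    #include <linux/filter.h>
    #include <linux/rcupdate.h>

    static unsigned int run_prog_from_array(struct bpf_array *array, u32 index,
                                            void *ctx)
    {
            struct bpf_prog *prog;
            unsigned int ret = 0;

            rcu_read_lock();
            prog = READ_ONCE(array->ptrs[index]);
            if (prog)
                    ret = BPF_PROG_RUN(prog, ctx);
            rcu_read_unlock();
            /*
             * A concurrent, final bpf_prog_put() now merely schedules
             * __bpf_prog_put_rcu(), so the program image stays valid at
             * least until the rcu_read_unlock() above has completed.
             */
            return ret;
    }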
However, due to this, there won't be severely more stress on the RCU callback queue: situations with above bpf_prog_get() and bpf_prog_put() combo in practice normally won't lead to releases, but even if they would, enough effort/ cycles have to be put into loading a BPF program into the kernel already. Reported-by: Jann Horn Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 4 +--- kernel/bpf/syscall.c | 13 +++---------- kernel/events/core.c | 2 +- 3 files changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5af30732697b..4ec57a649b1f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -390,9 +390,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, static void prog_fd_array_put_ptr(void *ptr) { - struct bpf_prog *prog = ptr; - - bpf_prog_put_rcu(prog); + bpf_prog_put(ptr); } /* decrement refcnt of all bpf_progs that are stored in this map */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c23a4e9311b3..f6806a1d7ed9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -623,7 +623,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } -static void __prog_put_common(struct rcu_head *rcu) +static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -632,17 +632,10 @@ static void __prog_put_common(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -/* version of bpf_prog_put() that is called after a grace period */ -void bpf_prog_put_rcu(struct bpf_prog *prog) -{ - if (atomic_dec_and_test(&prog->aux->refcnt)) - call_rcu(&prog->aux->rcu, __prog_put_common); -} - void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) - __prog_put_common(&prog->aux->rcu); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -650,7 +643,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 85cd41878a74..9c51ec3f0f44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7529,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put_rcu(prog); + bpf_prog_put(prog); } } -- cgit v1.2.3 From 113214be7f6c98dd6d0435e4765aea8dea91662c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 30 Jun 2016 17:24:44 +0200 Subject: bpf: refactor bpf_prog_get and type check into helper Since bpf_prog_get() and program type check is used in a couple of places, refactor this into a small helper function that we can make use of. Since the non RO prog->aux part is not used in performance critical paths and a program destruction via RCU is rather very unlikley when doing the put, we shouldn't have an issue just doing the bpf_prog_get() + prog->type != type check, but actually not taking the ref at all (due to being in fdget() / fdput() section of the bpf fd) is even cleaner and makes the diff smaller as well, so just go for that. Callsites are changed to make use of the new helper where possible. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f6806a1d7ed9..22863d9872b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -657,7 +657,7 @@ int bpf_prog_new_fd(struct bpf_prog *prog) O_RDWR | O_CLOEXEC); } -static struct bpf_prog *__bpf_prog_get(struct fd f) +static struct bpf_prog *____bpf_prog_get(struct fd f) { if (!f.file) return ERR_PTR(-EBADF); @@ -678,24 +678,35 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) return prog; } -/* called by sockets/tracing/seccomp before attaching program to an event - * pairs with bpf_prog_put() - */ -struct bpf_prog *bpf_prog_get(u32 ufd) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); struct bpf_prog *prog; - prog = __bpf_prog_get(f); + prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; + if (type && prog->type != *type) { + prog = ERR_PTR(-EINVAL); + goto out; + } prog = bpf_prog_inc(prog); +out: fdput(f); - return prog; } -EXPORT_SYMBOL_GPL(bpf_prog_get); + +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + return __bpf_prog_get(ufd, NULL); +} + +struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) +{ + return __bpf_prog_get(ufd, &type); +} +EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD kern_version -- cgit v1.2.3 From 1f3fe7ebf6136c341012db9f554d4caa566fcbaa Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:42 -0700 Subject: cgroup: Add cgroup_get_from_fd Add a helper function to get a cgroup2 from a fd. It will be stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will be introduced in the later patch. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Signed-off-by: David S. Miller --- kernel/cgroup.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75c0ff00aca6..50787cd61da2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -62,6 +62,7 @@ #include #include #include +#include #include /* @@ -6209,6 +6210,40 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/** + * cgroup_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup2_dir) + * + * Find the cgroup from a fd which should be obtained + * by opening a cgroup directory. Returns a pointer to the + * cgroup on success. ERR_PTR is returned if the cgroup + * cannot be found. + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + struct file *f; + + f = fget_raw(fd); + if (!f) + return ERR_PTR(-EBADF); + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + fput(f); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_fd); + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. -- cgit v1.2.3 From 4ed8ec521ed57c4e207ad464ca0388776de74d4b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:43 -0700 Subject: cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations. 
To update an element, the caller is expected to obtain a cgroup2 backed fd by open(cgroup2_dir) and then update the array with that fd. Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 2 ++ 3 files changed, 47 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4ec57a649b1f..db1a743e3db2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -537,3 +537,46 @@ static int __init register_perf_event_array_map(void) return 0; } late_initcall(register_perf_event_array_map); + +#ifdef CONFIG_SOCK_CGROUP_DATA +static void *cgroup_fd_array_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int fd) +{ + return cgroup_get_from_fd(fd); +} + +static void cgroup_fd_array_put_ptr(void *ptr) +{ + /* cgroup_put free cgrp after a rcu grace period */ + cgroup_put(ptr); +} + +static void cgroup_fd_array_free(struct bpf_map *map) +{ + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static const struct bpf_map_ops cgroup_array_ops = { + .map_alloc = fd_array_map_alloc, + .map_free = cgroup_fd_array_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = fd_array_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = cgroup_fd_array_get_ptr, + .map_fd_put_ptr = cgroup_fd_array_put_ptr, +}; + +static struct bpf_map_type_list cgroup_array_type __read_mostly = { + .ops = &cgroup_array_ops, + .type = BPF_MAP_TYPE_CGROUP_ARRAY, +}; + +static int __init register_cgroup_array_map(void) +{ + bpf_register_map_type(&cgroup_array_type); + return 0; +} +late_initcall(register_cgroup_array_map); +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 22863d9872b1..96d938a22050 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eec9f90ba030..69ba2251a22b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1035,6 +1035,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (func_id != BPF_FUNC_get_stackid) goto error; break; + case BPF_MAP_TYPE_CGROUP_ARRAY: + goto error; default: break; } -- cgit v1.2.3 From 4a482f34afcc162d8456f449b137ec2a95be60d8 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 30 Jun 2016 10:28:44 -0700 Subject: cgroup: bpf: Add bpf_skb_in_cgroup_proto Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk belongs to a descendant of a cgroup2. It is similar to the feature added in netfilter: commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match") The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY which will be used by the bpf_skb_in_cgroup. Modifications to the bpf verifier is to ensure BPF_MAP_TYPE_CGROUP_ARRAY and bpf_skb_in_cgroup() are always used together. 
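A minimal user-space sketch of that flow (illustrative only, not from this patch; the cgroup path is a placeholder and error handling is reduced): open the cgroup2 directory, store the fd at a fixed index of the BPF_MAP_TYPE_CGROUP_ARRAY, and let the attached program test skbs against the same index with bpf_skb_in_cgroup():

    #include <fcntl.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int cgroup_array_set(int map_fd, uint32_t index, const char *cg2_path)
    {
            union bpf_attr attr;
            int cg_fd, err;

            cg_fd = open(cg2_path, O_RDONLY | O_DIRECTORY);
            if (cg_fd < 0)
                    return -1;

            memset(&attr, 0, sizeof(attr));
            attr.map_fd = map_fd;
            attr.key    = (uint64_t)(unsigned long)&index;
            attr.value  = (uint64_t)(unsigned long)&cg_fd;
            attr.flags  = BPF_ANY;

            err = syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
            /* the map takes its own cgroup reference, the fd is no longer needed */
            close(cg_fd);
            return err;
    }

For example, cgroup_array_set(map_fd, 0, "/sys/fs/cgroup/unified/foo") would then pair with a program-side bpf_skb_in_cgroup(skb, &map, 0) check (path chosen purely for illustration).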
Signed-off-by: Martin KaFai Lau Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69ba2251a22b..e206c2181412 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1036,7 +1036,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_CGROUP_ARRAY: - goto error; + if (func_id != BPF_FUNC_skb_in_cgroup) + goto error; + break; default: break; } @@ -1056,6 +1058,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) goto error; break; + case BPF_FUNC_skb_in_cgroup: + if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) + goto error; + break; default: break; } -- cgit v1.2.3 From 606274c5abd8e245add01bc7145a8cbb92b69ba8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 6 Jul 2016 22:38:36 -0700 Subject: bpf: introduce bpf_get_current_task() helper over time there were multiple requests to access different data structures and fields of task_struct current, so finally add the helper to access 'current' as-is. Tracing bpf programs will do the rest of walking the pointers via bpf_probe_read(). Note that current can be null and bpf program has to deal it with, but even dumb passing null into bpf_probe_read() is still safe. Suggested-by: Brendan Gregg Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 19c5b4a5c3eb..094c716154ed 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -312,6 +312,17 @@ const struct bpf_func_proto *bpf_get_event_output_proto(void) return &bpf_event_output_proto; } +static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return (long) current; +} + +static const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -329,6 +340,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: -- cgit v1.2.3 From a536a6e13ecd0d6eb0ffc36c5d56555896617282 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 11 Jul 2016 12:51:01 -0400 Subject: bpf: make inode code explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config BPF_SYSCALL init/Kconfig: bool "Enable bpf() system call" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modular infrastructure use, so that when reading the driver there is no doubt it is builtin-only. Note that MODULE_ALIAS is a no-op for non-modular code. We replace module.h with init.h since the file does use __init. 
Cc: Alexei Starovoitov Cc: netdev@vger.kernel.org Signed-off-by: Paul Gortmaker Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 318858edb1cd..5967b870a895 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -11,7 +11,7 @@ * version 2 as published by the Free Software Foundation. */ -#include +#include #include #include #include @@ -367,8 +367,6 @@ static struct file_system_type bpf_fs_type = { .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("bpf"); - static int __init bpf_init(void) { int ret; -- cgit v1.2.3 From 7e3f977edd0bd9ea6104156feba95bb5ae9bdd38 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:03 +0200 Subject: perf, events: add non-linear data support for raw records This patch adds support for non-linear data on raw records. It extends raw records to have one or multiple fragments that will be written linearly into the ring slot, where each fragment can optionally have a custom callback handler to walk and extract complex, possibly non-linear data. If a callback handler is provided for a fragment, then the new __output_custom() will be used instead of __output_copy() for the perf_output_sample() part. perf_prepare_sample() does all the size calculation only once, so perf_output_sample() doesn't need to redo the same work anymore, meaning real_size and padding will be cached in the raw record. The raw record becomes 32 bytes in size without holes; to not increase it further and to avoid doing unnecessary recalculations in fast-path, we can reuse next pointer of the last fragment, idea here is borrowed from ZERO_OR_NULL_PTR(), which should keep the perf_output_sample() path for PERF_SAMPLE_RAW minimal. This facility is needed for BPF's event output helper as a first user that will, in a follow-up, add an additional perf_raw_frag to its perf_raw_record in order to be able to more efficiently dump skb context after a linear head meta data related to it. skbs can be non-linear and thus need a custom output function to dump buffers. Currently, the skb data needs to be copied twice; with the help of __output_custom() this work only needs to be done once. Future users could be things like XDP/BPF programs that work on different context though and would thus also have a different callback function. The few users of raw records are adapted to initialize their frag data from the raw record itself, no change in behavior for them. The code is based upon a PoC diff provided by Peter Zijlstra [1]. [1] http://thread.gmane.org/gmane.linux.network/421294 Suggested-by: Peter Zijlstra Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/events/core.c | 66 +++++++++++++++++++++++++++++++++--------------- kernel/events/internal.h | 16 +++++++++--- kernel/trace/bpf_trace.c | 6 +++-- 3 files changed, 62 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9c51ec3f0f44..b1891b6b5c1f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5553,16 +5553,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5687,14 +5697,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -7331,7 +7355,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7387,8 +7411,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; perf_sample_data_init(&data, 0, 0); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..2417eb5512cd 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,10 +123,7 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \ { \ unsigned long size, written; \ \ @@ -152,6 +149,17 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(memcpy_func) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f 
copy_func, + const void *buf, unsigned long len) +__DEFINE_OUTPUT_COPY_BODY(copy_func) + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 094c716154ed..35ab1b2b041b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,8 +245,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) struct bpf_event_entry *ee; struct perf_event *event; struct perf_raw_record raw = { - .size = size, - .data = data, + .frag = { + .size = size, + .data = data, + }, }; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) -- cgit v1.2.3 From 8e7a3920ac277dd4e690c0e70c9750176e3acb83 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:04 +0200 Subject: bpf, perf: split bpf_perf_event_output Split the bpf_perf_event_output() helper as a preparation into two parts. The new bpf_perf_event_output() will prepare the raw record itself and test for unknown flags from BPF trace context, where the __bpf_perf_event_output() does the core work. The latter will be reused later on from bpf_event_output() directly. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 35ab1b2b041b..c35883a9bc11 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -233,26 +233,17 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_raw_record *raw) { - struct pt_regs *regs = (struct pt_regs *) (long) r1; - struct bpf_map *map = (struct bpf_map *) (long) r2; struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; - struct perf_raw_record raw = { - .frag = { - .size = size, - .data = data, - }, - }; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; if (index == BPF_F_CURRENT_CPU) index = cpu; if (unlikely(index >= array->map.max_entries)) @@ -271,11 +262,29 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) return -EOPNOTSUPP; perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = &raw; + sample_data.raw = raw; perf_event_output(event, &sample_data, regs); return 0; } +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +{ + struct pt_regs *regs = (struct pt_regs *)(long) r1; + struct bpf_map *map = (struct bpf_map *)(long) r2; + void *data = (void *)(long) r4; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return __bpf_perf_event_output(regs, map, flags, &raw); +} + static const struct bpf_func_proto bpf_perf_event_output_proto = { .func = bpf_perf_event_output, .gpl_only = true, -- cgit v1.2.3 From 555c8a8623a3a87b3c990ba30b7fd2e5914e41d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 14 Jul 2016 18:08:05 +0200 Subject: bpf: avoid stack copy and use skb ctx for 
event output This work addresses a couple of issues bpf_skb_event_output() helper currently has: i) We need two copies instead of just a single one for the skb data when it should be part of a sample. The data can be non-linear and thus needs to be extracted via bpf_skb_load_bytes() helper first, and then copied once again into the ring buffer slot. ii) Since bpf_skb_load_bytes() currently needs to be used first, the helper needs to see a constant size on the passed stack buffer to make sure BPF verifier can do sanity checks on it during verification time. Thus, just passing skb->len (or any other non-constant value) wouldn't work, but changing bpf_skb_load_bytes() is also not the proper solution, since the two copies are generally still needed. iii) bpf_skb_load_bytes() is just for rather small buffers like headers, since they need to sit on the limited BPF stack anyway. Instead of working around in bpf_skb_load_bytes(), this work improves the bpf_skb_event_output() helper to address all 3 at once. We can make use of the passed in skb context that we have in the helper anyway, and use some of the reserved flag bits as a length argument. The helper will use the new __output_custom() facility from perf side with bpf_skb_copy() as callback helper to walk and extract the data. It will pass the data for setup to bpf_event_output(), which generates and pushes the raw record with an additional frag part. The linear data used in the first frag of the record serves as programmatically defined meta data passed along with the appended sample. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 6 ++++-- kernel/trace/bpf_trace.c | 33 +++++++++++++++------------------ 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d638062f66d6..03fd23d4d587 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1054,9 +1054,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) return NULL; } -const struct bpf_func_proto * __weak bpf_get_event_output_proto(void) +u64 __weak +bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - return NULL; + return -ENOTSUPP; } /* Always built-in helper functions. */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index c35883a9bc11..ebfbb7dd7033 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -298,29 +298,26 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); -static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + .next = ctx_size ? 
&frag : NULL, + .size = meta_size, + .data = meta, + }, + }; perf_fetch_caller_regs(regs); - return bpf_perf_event_output((long)regs, r2, flags, r4, size); -} - -static const struct bpf_func_proto bpf_event_output_proto = { - .func = bpf_event_output, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, -}; - -const struct bpf_func_proto *bpf_get_event_output_proto(void) -{ - return &bpf_event_output_proto; + return __bpf_perf_event_output(regs, map, flags, &raw); } static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -- cgit v1.2.3 From 858d68f10238fdd1ebdd0096f912f063e97c6766 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 16 Jul 2016 01:15:55 +0200 Subject: bpf: bpf_event_entry_gen's alloc needs to be in atomic context Should have been obvious, only called from bpf() syscall via map_update_elem() that calls bpf_fd_array_map_update_elem() under RCU read lock and thus this must also be in GFP_ATOMIC, of course. Fixes: 3b1efb196eee ("bpf, maps: flush own entries on perf map release") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index db1a743e3db2..633a650d7aeb 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -430,7 +430,7 @@ static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, { struct bpf_event_entry *ee; - ee = kzalloc(sizeof(*ee), GFP_KERNEL); + ee = kzalloc(sizeof(*ee), GFP_ATOMIC); if (ee) { ee->event = perf_file->private_data; ee->perf_file = perf_file; -- cgit v1.2.3 From 183fc1537ec39be242dc8b619f71fc11b393d295 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Jul 2016 15:50:58 -0700 Subject: kernel/trace/bpf_trace.c: work around gcc-4.4.4 anon union initialization bug kernel/trace/bpf_trace.c: In function 'bpf_event_output': kernel/trace/bpf_trace.c:312: error: unknown field 'next' specified in initializer kernel/trace/bpf_trace.c:312: warning: missing braces around initializer kernel/trace/bpf_trace.c:312: warning: (near initialization for 'raw.frag.') Fixes: 555c8a8623a3a87 ("bpf: avoid stack copy and use skb ctx for event output") Acked-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: David S. Miller Signed-off-by: Andrew Morton Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ebfbb7dd7033..a12bbd32c0a6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -309,7 +309,9 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_raw_record raw = { .frag = { - .next = ctx_size ? &frag : NULL, + { + .next = ctx_size ? &frag : NULL, + }, .size = meta_size, .data = meta, }, -- cgit v1.2.3 From 59d3656d5bf504f771fc44fdbc7a9a8590795f22 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:46 -0700 Subject: bpf: add bpf_prog_add api for bulk prog refcnt A subsystem may need to store many copies of a bpf program, each deserving its own reference. Rather than requiring the caller to loop one by one (with possible mid-loop failure), add a bulk bpf_prog_add api. 
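As a hypothetical driver-side sketch (not part of this patch; the priv/ring structures and function name are invented for illustration), a subsystem holding one program pointer per RX ring would take all references in a single step instead of calling bpf_prog_inc() once per ring:

    #include <linux/atomic.h>
    #include <linux/bpf.h>
    #include <linux/err.h>

    struct my_rx_ring {
            struct bpf_prog *xdp_prog;
            /* ... */
    };

    struct my_priv {
            struct my_rx_ring *rx_ring;
            int num_rx_rings;
            /* ... */
    };

    static int my_xdp_set_prog(struct my_priv *priv, struct bpf_prog *prog)
    {
            struct bpf_prog *old;
            int i;

            /* one reference per RX ring, taken up front in bulk */
            prog = bpf_prog_add(prog, priv->num_rx_rings);
            if (IS_ERR(prog))
                    return PTR_ERR(prog);

            for (i = 0; i < priv->num_rx_rings; i++) {
                    old = xchg(&priv->rx_ring[i].xdp_prog, prog);
                    if (old)
                            bpf_prog_put(old);
            }
            return 0;
    }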
Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 96d938a22050..228f962447a5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -670,14 +670,20 @@ static struct bpf_prog *____bpf_prog_get(struct fd f) return f.file->private_data; } -struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) { - if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { - atomic_dec(&prog->aux->refcnt); + if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { + atomic_sub(i, &prog->aux->refcnt); return ERR_PTR(-EBUSY); } return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_add); + +struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return bpf_prog_add(prog, 1); +} static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.2.3 From 6a773a15a1e8874e5eccd2f29190c31085912c95 Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:47 -0700 Subject: bpf: add XDP prog type for early driver filter Add a new bpf prog type that is intended to run in early stages of the packet rx path. Only minimal packet metadata will be available, hence a new context type, struct xdp_md, is exposed to userspace. So far only expose the packet start and end pointers, and only in read mode. An XDP program must return one of the well known enum values, all other return codes are reserved for future use. Unfortunately, this restriction is hard to enforce at verification time, so take the approach of warning at runtime when such programs are encountered. Out of bounds return codes should alias to XDP_ABORTED. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e206c2181412..a8d67d097b0d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -713,6 +713,7 @@ static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg, switch (env->prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: break; default: verbose("verifier is misconfigured\n"); -- cgit v1.2.3 From 4acf6c0b84c91243c705303cd9ff16421914150d Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Tue, 19 Jul 2016 12:16:56 -0700 Subject: bpf: enable direct packet data write for xdp progs For forwarding to be effective, XDP programs should be allowed to rewrite packet data. This requires that the drivers supporting XDP must all map the packet memory as TODEVICE or BIDIRECTIONAL before invoking the program. Signed-off-by: Brenden Blanco Acked-by: Alexei Starovoitov Signed-off-by: David S. 
---
 kernel/bpf/verifier.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8d67d097b0d..f72f23b8fdab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -653,6 +653,16 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,

 #define MAX_PACKET_OFF 0xffff

+static bool may_write_pkt_data(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_XDP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 			       int size)
 {
@@ -806,10 +816,15 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE) {
+		if (t == BPF_WRITE && !may_write_pkt_data(env->prog->type)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into packet\n", value_regno);
+			return -EACCES;
+		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
-- cgit v1.2.3

From aa7145c16d6bf086538ad7eb20c807513bfa5efc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 22 Jul 2016 01:19:42 +0200
Subject: bpf, events: fix offset in skb copy handler

This patch fixes the __output_custom() routine we currently use with
bpf_skb_copy(). I missed that when len is larger than the size of the
current handle, we can issue multiple invocations of copy_func, and
__output_custom() advances destination but also source buffer by the
written amount of bytes. When we have __output_custom(), this is actually
wrong since in that case the source buffer points to a non-linear object,
in our case an skb, which the copy_func helper is supposed to walk.
Therefore, since this is non-linear we thus need to pass the offset into
the helper, so that copy_func can use it for extracting the data from
the source object.

Therefore, adjust the callback signatures properly and pass offset
into the skb_header_pointer() invoked from bpf_skb_copy() callback. The
__DEFINE_OUTPUT_COPY_BODY() is adjusted to accommodate for two things:
i) to pass in whether we should advance source buffer or not; this is
a compile-time constant condition, ii) to pass in the offset for
__output_custom(), which we do with help of __VA_ARGS__, so everything
can stay inlined as is currently. Both changes allow for adapting the
__output_* fast-path helpers w/o extra overhead.

Fixes: 555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output")
Fixes: 7e3f977edd0b ("perf, events: add non-linear data support for raw records")
Signed-off-by: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/events/internal.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2417eb5512cd..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,18 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 	return rb->aux_nr_pages << PAGE_SHIFT;
 }

-#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...)	\
 { \
 	unsigned long size, written; \
 \
 	do { \
 		size = min(handle->size, len); \
-		written = memcpy_func(handle->addr, buf, size); \
+		written = memcpy_func(__VA_ARGS__); \
 		written = size - written; \
 \
 		len -= written; \
 		handle->addr += written; \
-		buf += written; \
+		if (advance_buf) \
+			buf += written; \
 		handle->size -= written; \
 		if (!handle->size) { \
 			struct ring_buffer *rb = handle->rb; \
@@ -153,12 +154,16 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
 static inline unsigned long \
 func_name(struct perf_output_handle *handle, \
 	  const void *buf, unsigned long len) \
-__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)

 static inline unsigned long
 __output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
 		const void *buf, unsigned long len)
-__DEFINE_OUTPUT_COPY_BODY(copy_func)
+{
+	unsigned long orig_len = len;
+	__DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+				  orig_len - len, size)
+}

 static inline unsigned long
 memcpy_common(void *dst, const void *src, unsigned long n)
-- cgit v1.2.3

From 96ae52279594470622ff0585621a13e96b700600 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon
Date: Mon, 25 Jul 2016 05:54:46 -0700
Subject: bpf: Add bpf_probe_write_user BPF helper to be called in tracers

This allows user memory to be written to during the course of a kprobe.
It shouldn't be used to implement any kind of security mechanism
because of TOC-TOU attacks, but rather to debug, divert, and manipulate
execution of semi-cooperative processes.

Although it uses probe_kernel_write, we limit the address space the
probe can write into by checking the space with access_ok. We do this
as opposed to calling copy_to_user directly, in order to avoid sleeping.
In addition we ensure the thread's current fs / segment is USER_DS and
the thread isn't exiting nor a kernel thread.

Given this feature is meant for experiments, and it has a risk of
crashing the system and running programs, we print a warning when a
proglet that attempts to use this helper is installed, along with the
pid and process name.

Signed-off-by: Sargun Dhillon
Cc: Alexei Starovoitov
Cc: Daniel Borkmann
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a12bbd32c0a6..b20438fdb029 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };

+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *src = (void *) (long) r2;
+	int size = (int) r3;
+
+	/*
+	 * Ensure we're in user context which is safe for the helper to
+	 * run. This helper has no business in a kthread.
+	 *
+	 * access_ok() should prevent writing to non-user memory, but in
+	 * some situations (nommu, temporary switch, etc) access_ok() does
+	 * not provide enough validation, hence the check on KERNEL_DS.
+	 */
+	if (unlikely(in_interrupt() ||
+		     current->flags & (PF_KTHREAD | PF_EXITING)))
+		return -EPERM;
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+		return -EPERM;
+	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+		return -EPERM;
+
+	return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+	.func		= bpf_probe_write_user,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
+			    current->comm, task_pid_nr(current));
+
+	return &bpf_probe_write_user_proto;
+}
+
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_probe_write_user:
+		return bpf_get_probe_write_proto();
 	default:
 		return NULL;
 	}
-- cgit v1.2.3
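
As context for how the helper is meant to be used (an illustrative sketch in
the style of the samples/bpf programs of the time, not part of the patch; the
SEC() macro, the helper function declarations, the x86_64 PT_REGS_PARM2
definition, and the port numbers are all assumptions here), a kprobe program
could divert a connect() call by patching the caller's sockaddr in place:

#include <uapi/linux/bpf.h>
#include <uapi/linux/in.h>
#include <linux/socket.h>
#include <linux/ptrace.h>

#define SEC(name) __attribute__((section(name), used))
#define PT_REGS_PARM2(x) ((x)->si)	/* x86_64 calling convention */

static int (*bpf_probe_read)(void *dst, int size, void *src) =
	(void *) BPF_FUNC_probe_read;
static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
	(void *) BPF_FUNC_probe_write_user;

/* Redirect connect() calls aimed at port 5555 to port 6666 by rewriting the
 * sockaddr in the (semi-cooperative) process's own memory. __builtin_bswap16
 * stands in for htons() on little-endian hosts.
 */
SEC("kprobe/sys_connect")
int divert_connect(struct pt_regs *ctx)
{
	void *uaddr = (void *) PT_REGS_PARM2(ctx);
	struct sockaddr_in sa = {};

	if (bpf_probe_read(&sa, sizeof(sa), uaddr))
		return 0;

	if (sa.sin_family == AF_INET &&
	    sa.sin_port == __builtin_bswap16(5555)) {
		sa.sin_port = __builtin_bswap16(6666);
		bpf_probe_write_user(uaddr, &sa, sizeof(sa));
	}

	return 0;
}

Loading a program that requests this helper triggers the ratelimited warning
above, tagged with the loader's comm and pid.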