From 95ec66968571bf0af0a22effdc1b9d9e62ea6630 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:56 -0700 Subject: samples/bpf: Use getppid instead of getpgrp for array map stress When cross-compiling the bpf sample map_perf_test for aarch64, I find that __NR_getpgrp is undefined. This causes build errors. This syscall is deprecated and requires defining __ARCH_WANT_SYSCALL_DEPRECATED. To avoid having to define that, just use a different syscall (getppid) for the array map stress test. Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/map_perf_test_kern.c | 2 +- samples/bpf/map_perf_test_user.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 098c857f1eda..2b2ffb97018b 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getpgrp") +SEC("kprobe/sys_getppid") int stress_array_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index f388254896f6..a0310fc70057 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -282,7 +282,7 @@ static void test_array_lookup(int cpu) start_time = time_get_ns(); for (i = 0; i < max_cnt; i++) - syscall(__NR_getpgrp, 0); + syscall(__NR_getppid, 0); printf("%d:array_lookup %lld lookups per sec\n", cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } -- cgit v1.2.3 From 876e88e3273e300895e308bd660c6cfaabb03cd5 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:57 -0700 Subject: samples/bpf: Enable cross compiler support When cross compiling, bpf samples use HOSTCC for compiling the non-BPF part of the sample, however what we really want is to use the cross compiler to build for the cross target since that is what will load and run the BPF sample. Detect this and compile samples correctly. Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/Makefile | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index cf17c7932a6e..13f74b67ca44 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -177,6 +177,11 @@ HOSTLOADLIBES_syscall_tp += -lelf LLC ?= llc CLANG ?= clang +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +HOSTCC = $(CROSS_COMPILE)gcc +endif + # Trick to allow make to be run from this directory all: $(MAKE) -C ../../ $(CURDIR)/ -- cgit v1.2.3 From b655fc1c2ee1c2b2d07c0b6f432798b91202718c Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:58 -0700 Subject: samples/bpf: Fix pt_regs issues when cross-compiling BPF samples fail to build when cross-compiling for ARM64 because of incorrect pt_regs param selection. This is because clang defines __x86_64__ and bpf_headers thinks we're building for x86. Since clang is building for the BPF target, it shouldn't make assumptions about what target the BPF program is going to run on. To fix this, lets pass ARCH so the header knows which target the BPF program is being compiled for and can use the correct pt_regs code. 
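As a rough illustration of what the new define enables (a hedged sketch in plain C, not the actual selftests header; PT_REGS_PARM1/PT_REGS_RC and the field names follow the usual x86-64 and arm64 pt_regs layouts):

#if defined(__TARGET_ARCH_arm64)
/* arm64: arguments are passed in regs[0..7] of struct user_pt_regs */
#define PT_REGS_PARM1(x) ((x)->regs[0])
#define PT_REGS_RC(x)    ((x)->regs[0])
#elif defined(__TARGET_ARCH_x86) || defined(__x86_64__)
/* x86-64: first argument in %rdi, return value in %rax */
#define PT_REGS_PARM1(x) ((x)->di)
#define PT_REGS_RC(x)    ((x)->ax)
#else
#error "unknown target, pass ARCH= so a __TARGET_ARCH_<arch> define is set"
#endif

With -D__TARGET_ARCH_arm64 on the clang command line the header no longer has to guess the architecture from __x86_64__, which clang defines based on the build host rather than the BPF target.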
Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 13f74b67ca44..ebc2ad69b62c 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -230,7 +230,7 @@ $(obj)/%.o: $(src)/%.c $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ -I$(srctree)/tools/testing/selftests/bpf/ \ -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ - -Wno-compare-distinct-pointer-types \ + -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ -Wno-unknown-warning-option \ -- cgit v1.2.3 From 8bf2ac25a96c69985a1a9fbbad7da22ae4343a38 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 20 Sep 2017 09:11:59 -0700 Subject: samples/bpf: Add documentation on cross compilation Acked-by: Alexei Starovoitov Signed-off-by: Joel Fernandes Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/README.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'samples') diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index 79f9a58f1872..5f27e4faca50 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + +Cross compiling samples +----------------------- +In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH +environment variables before calling make. This will direct make to build +samples for the cross target. + +export ARCH=arm64 +export CROSS_COMPILE="aarch64-linux-gnu-" +make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang -- cgit v1.2.3 From 88cda1c9da02c8aa31e1d5dcf22e8a35cc8c19f2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:54 -0700 Subject: bpf: libbpf: Provide basic API support to specify BPF obj name This patch extends the libbpf to provide API support to allow specifying BPF object name. In tools/lib/bpf/libbpf, the C symbol of the function and the map is used. Regarding section name, all maps are under the same section named "maps". Hence, section name is not a good choice for map's name. To be consistent with map, bpf_prog also follows and uses its function symbol as the prog's name. This patch adds logic to collect function's symbols in libbpf. There is existing codes to collect the map's symbols and no change is needed. The bpf_load_program_name() and bpf_map_create_name() are added to take the name argument. For the other bpf_map_create_xxx() variants, a name argument is directly added to them. In samples/bpf, bpf_load.c in particular, the symbol is also used as the map's name and the map symbols has already been collected in the existing code. For bpf_prog, bpf_load.c does not collect the function symbol name. We can consider to collect them later if there is a need to continue supporting the bpf_load.c. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/bpf_load.c | 2 ++ samples/bpf/map_perf_test_user.c | 1 + 2 files changed, 3 insertions(+) (limited to 'samples') diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 6aa50098dfb8..18b1c8dd0391 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -221,6 +221,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, inner_map_fd, maps[i].def.max_entries, @@ -228,6 +229,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, numa_node); } else { map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, maps[i].def.value_size, maps[i].def.max_entries, diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index a0310fc70057..519d9af4b04a 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu) inner_lru_map_fds[cpu] = bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], sizeof(uint32_t), sizeof(long), inner_lru_hash_size, 0, -- cgit v1.2.3 From 0929567a7a2dab8455a7313956973ff0d339709a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 1 Oct 2017 14:07:34 -0700 Subject: samples/bpf: fix warnings in xdp_monitor_user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make local functions static to fix HOSTCC samples/bpf/xdp_monitor_user.o samples/bpf/xdp_monitor_user.c:64:7: warning: no previous prototype for ‘gettime’ [-Wmissing-prototypes] __u64 gettime(void) ^~~~~~~ samples/bpf/xdp_monitor_user.c:209:6: warning: no previous prototype for ‘print_bpf_prog_info’ [-Wmissing-prototypes] void print_bpf_prog_info(void) ^~~~~~~~~~~~~~~~~~~ Fixes: 3ffab5460264 ("samples/bpf: xdp_monitor tool based on tracepoints") Signed-off-by: Stephen Hemminger Acked-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- samples/bpf/xdp_monitor_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index b51b4f5e3257..c5ab8b776973 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -61,7 +61,7 @@ static void usage(char *argv[]) } #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -__u64 gettime(void) +static __u64 gettime(void) { struct timespec t; int res; @@ -206,7 +206,7 @@ static void stats_poll(int interval, bool err_only) } } -void print_bpf_prog_info(void) +static void print_bpf_prog_info(void) { int i; -- cgit v1.2.3 From 39323e788cb672adba8709ca407bd6763aae577d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:25 -0700 Subject: samples/bpf: add multi-prog cgroup test case create 5 cgroups, attach 6 progs and check that progs are executed as: cgrp1 (MULTI progs A, B) -> cgrp2 (OVERRIDE prog C) -> cgrp3 (MULTI prog D) -> cgrp4 (OVERRIDE prog E) -> cgrp5 (NONE prog F) the event in cgrp5 triggers execution of F,D,A,B in that order. if prog F is detached, the execution is E,D,A,B if prog F and D are detached, the execution is E,A,B if prog F, E and D are detached, the execution is C,A,B Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/cgroup_helpers.c | 4 +- samples/bpf/test_cgrp2_attach2.c | 188 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 185 insertions(+), 7 deletions(-) (limited to 'samples') diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c index 9d1be9426401..88bdcf4b1670 100644 --- a/samples/bpf/cgroup_helpers.c +++ b/samples/bpf/cgroup_helpers.c @@ -56,7 +56,7 @@ int setup_cgroup_environment(void) return 1; } - if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) { + if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) { log_err("mount cgroup2"); return 1; } @@ -163,7 +163,7 @@ int create_and_get_cgroup(char *path) format_cgroup_path(cgroup_path, path); if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup"); + log_err("mkdiring cgroup %s .. %s", path, cgroup_path); return 0; } diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 3049b1f26267..9a9f6836e5e9 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -30,7 +30,7 @@ #define FOO "/foo" #define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1" +#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" char bpf_log_buf[BPF_LOG_BUF_SIZE]; @@ -55,8 +55,7 @@ static int prog_load(int verdict) return ret; } - -int main(int argc, char **argv) +static int test_foo_bar(void) { int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; @@ -189,8 +188,187 @@ out: close(bar); cleanup_cgroup_environment(); if (!rc) - printf("PASS\n"); + printf("### override:PASS\n"); + else + printf("### override:FAIL\n"); + return rc; +} + +static int map_fd = -1; + +static int prog_load_cnt(int verdict, int val) +{ + if (map_fd < 0) + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + struct bpf_insn prog[] = { + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + int ret; + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + + if (ret < 0) { + log_err("Loading program"); + printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); + return 0; + } + return ret; +} + + +static int test_multiprog(void) +{ + int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; + int drop_prog, allow_prog[6] = {}, rc = 0; + unsigned long long value; + int i = 0; + + for (i = 0; i < 6; i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if (!allow_prog[i]) + goto err; + } + drop_prog = prog_load_cnt(0, 1); + if (!drop_prog) + goto err; + + if (setup_cgroup_environment()) + goto err; + + cg1 = create_and_get_cgroup("/cg1"); + if (!cg1) + goto err; + cg2 = create_and_get_cgroup("/cg1/cg2"); + if (!cg2) + goto err; + cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); + if (!cg3) + goto err; + cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); + if (!cg4) + goto err; + cg5 = 
create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); + if (!cg5) + goto err; + + if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) + goto err; + + if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog to cg1"); + goto err; + } + if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Unexpected success attaching the same prog to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog2 to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to cg2"); + goto err; + } + if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) { + log_err("Attaching prog to cg3"); + goto err; + } + if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) { + log_err("Attaching prog to cg4"); + goto err; + } + if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching prog to cg5"); + goto err; + } + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 32); + + /* detach bottom program and ping again */ + if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg5"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 16); + + /* detach 3rd from bottom program and ping again */ + errno = 0; + if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Unexpected success on detach from cg3"); + goto err; + } + if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching from cg3"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 16); + + /* detach 2nd from bottom program and ping again */ + if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg4"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 4); + goto out; +err: + rc = 1; + +out: + for (i = 0; i < 6; i++) + if (allow_prog[i] > 0) + close(allow_prog[i]); + close(cg1); + close(cg2); + close(cg3); + close(cg4); + close(cg5); + cleanup_cgroup_environment(); + if (!rc) + printf("### multi:PASS\n"); else - printf("FAIL\n"); + printf("### multi:FAIL\n"); return rc; } + +int main(int argc, char **argv) +{ + int rc = 0; + + rc = test_foo_bar(); + if (rc) + return rc; + + return test_multiprog(); +} -- cgit v1.2.3 From dfc069998ebb010f910dfec379dab4f44d331980 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:28 -0700 Subject: samples/bpf: use bpf_prog_query() interface use BPF_PROG_QUERY command to strengthen test coverage Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/test_cgrp2_attach2.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'samples') diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 9a9f6836e5e9..3e8232cc04a8 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -236,6 +236,7 @@ static int prog_load_cnt(int verdict, int val) static int test_multiprog(void) { + __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; int drop_prog, allow_prog[6] = {}, rc = 0; unsigned long long value; @@ -304,6 +305,32 @@ static int test_multiprog(void) assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); assert(value == 1 + 2 + 8 + 32); + /* query the number of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + NULL, NULL, &prog_cnt) == 0); + assert(prog_cnt == 4); + /* retrieve prog_ids of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 4); + assert(attach_flags == 0); + saved_prog_id = prog_ids[0]; + /* check enospc handling */ + prog_ids[0] = 0; + prog_cnt = 2; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == -1 && + errno == ENOSPC); + assert(prog_cnt == 4); + /* check that prog_ids are returned even when buffer is too small */ + assert(prog_ids[0] == saved_prog_id); + /* retrieve prog_id of single attached prog in cg5 */ + prog_ids[0] = 0; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 1); + assert(prog_ids[0] == saved_prog_id); + /* detach bottom program and ping again */ if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { log_err("Detaching prog from cg5"); @@ -341,6 +368,15 @@ static int test_multiprog(void) assert(system(PING_CMD) == 0); assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); assert(value == 1 + 2 + 4); + + prog_cnt = 4; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 3); + assert(attach_flags == 0); + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 0); goto out; err: rc = 1; -- cgit v1.2.3 From f4ce0a0116bc90803adac10865f14429313cb2b6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 6 Oct 2017 10:41:41 +0200 Subject: samples/bpf: xdp_monitor first 8 bytes are not accessible by bpf The first 8 bytes of the tracepoint context struct are not accessible by the bpf code. This is a choice that dates back to the original inclusion of this code. See explaination in: commit 98b5c2c65c29 ("perf, bpf: allow bpf programs attach to tracepoints") Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- samples/bpf/xdp_monitor_kern.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 74f3fd8ed729..cc7e19d2ad76 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -17,19 +17,15 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = { * Code in: kernel/include/trace/events/xdp.h */ struct xdp_redirect_ctx { - unsigned short common_type; // offset:0; size:2; signed:0; - unsigned char common_flags; // offset:2; size:1; signed:0; - unsigned char common_preempt_count;// offset:3; size:1; signed:0; - int common_pid; // offset:4; size:4; signed:1; - - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 enum { XDP_REDIRECT_SUCCESS = 0, -- cgit v1.2.3 From 280b058d4801cb431477dc101a776e4b24995f2f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 6 Oct 2017 10:41:46 +0200 Subject: samples/bpf: xdp_monitor also record xdp_exception tracepoint Also monitor the tracepoint xdp_exception. This tracepoint is usually invoked by the drivers. Programs themselves can activate this by returning XDP_ABORTED, which will drop the packet but also trigger the tracepoint. This is useful for distinguishing intentional (XDP_DROP) vs. ebpf-program error cases that cased a drop (XDP_ABORTED). Drivers also use this tracepoint for reporting on XDP actions that are unknown to the specific driver. This can help the user to detect if a driver e.g. doesn't implement XDP_REDIRECT yet. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- samples/bpf/xdp_monitor_kern.c | 38 ++++++++++++++- samples/bpf/xdp_monitor_user.c | 108 ++++++++++++++++++++++++++++++++--------- 2 files changed, 121 insertions(+), 25 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index cc7e19d2ad76..2fe2f761a0d0 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -13,6 +13,14 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = { /* TODO: have entries for all possible errno's */ }; +#define XDP_UNKNOWN XDP_REDIRECT + 1 +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = XDP_UNKNOWN + 1, +}; + /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h */ @@ -44,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); if (!cnt) - return 0; + return 1; *cnt += 1; return 0; /* Indicate event was filtered (no further processing)*/ @@ -82,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) { return xdp_redirect_collect_stat(ctx); } + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + u64 *cnt;; + u32 key; + + key = ctx->act; + if (key > XDP_REDIRECT) + key = XDP_UNKNOWN; + + cnt = bpf_map_lookup_elem(&exception_cnt, &key); + if (!cnt) + return 1; + *cnt += 1; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index c5ab8b776973..97c3456c11b2 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -89,6 +89,23 @@ static const char *err2str(int err) return redir_names[err]; return NULL; } +/* enum xdp_action */ +#define XDP_UNKNOWN XDP_REDIRECT + 1 +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} struct record { __u64 counter; @@ -97,6 +114,7 @@ struct record { struct stats_record { struct record xdp_redir[REDIR_RES_MAX]; + struct record xdp_exception[XDP_ACTION_MAX]; }; static void stats_print_headers(bool err_only) @@ -104,39 +122,72 @@ static void stats_print_headers(bool err_only) if (err_only) printf("\n%s\n", __doc_err_only__); - printf("%-14s %-10s %-18s %-9s\n", - "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period"); + printf("%-14s %-11s %-10s %-18s %-9s\n", + "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct record *r, struct record *p, double period) +{ + __u64 
packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->counter - p->counter; + pps = packets / period; + } + return pps; } static void stats_print(struct stats_record *rec, struct stats_record *prev, bool err_only) { + double period = 0, pps = 0; + struct record *r, *p; int i = 0; + char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + + /* tracepoint: xdp:xdp_redirect_* */ if (err_only) i = REDIR_ERROR; for (; i < REDIR_RES_MAX; i++) { - struct record *r = &rec->xdp_redir[i]; - struct record *p = &prev->xdp_redir[i]; - __u64 period = 0; - __u64 packets = 0; - double pps = 0; - double period_ = 0; + r = &rec->xdp_redir[i]; + p = &prev->xdp_redir[i]; if (p->timestamp) { - packets = r->counter - p->counter; - period = r->timestamp - p->timestamp; - if (period > 0) { - period_ = ((double) period / NANOSEC_PER_SEC); - pps = packets / period_; - } + period = calc_period(r, p); + pps = calc_pps(r, p, period); } + printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + } - printf("%-14s %-10.0f %'-18.0f %f\n", - err2str(i), pps, pps, period_); + /* tracepoint: xdp:xdp_exception */ + for (i = 0; i < XDP_ACTION_MAX; i++) { + r = &rec->xdp_exception[i]; + p = &prev->xdp_exception[i]; + if (p->timestamp) { + period = calc_period(r, p); + pps = calc_pps(r, p, period); + } + if (pps > 0) + printf(fmt, action2str(i), "Exception", + pps, pps, period); } + printf("\n"); } static __u64 get_key32_value64_percpu(int fd, __u32 key) @@ -160,25 +211,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key) return sum; } -static bool stats_collect(int fd, struct stats_record *rec) +static bool stats_collect(struct stats_record *rec) { + int fd; int i; /* TODO: Detect if someone unloaded the perf event_fd's, as * this can happen by someone running perf-record -e */ + fd = map_data[0].fd; /* map0: redirect_err_cnt */ for (i = 0; i < REDIR_RES_MAX; i++) { rec->xdp_redir[i].timestamp = gettime(); rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); } + + fd = map_data[1].fd; /* map1: exception_cnt */ + for (i = 0; i < XDP_ACTION_MAX; i++) { + rec->xdp_exception[i].timestamp = gettime(); + rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + } + return true; } static void stats_poll(int interval, bool err_only) { struct stats_record rec, prev; - int map_fd; memset(&rec, 0, sizeof(rec)); @@ -190,16 +249,17 @@ static void stats_poll(int interval, bool err_only) printf("\n%s", __doc__); /* TODO Need more advanced stats on error types */ - if (verbose) - printf(" - Stats map: %s\n", map_data[0].name); - map_fd = map_data[0].fd; - - stats_print_headers(err_only); + if (verbose) { + printf(" - Stats map0: %s\n", map_data[0].name); + printf(" - Stats map1: %s\n", map_data[1].name); + printf("\n"); + } fflush(stdout); while (1) { memcpy(&prev, &rec, sizeof(rec)); - stats_collect(map_fd, &rec); + stats_collect(&rec); + stats_print_headers(err_only); stats_print(&rec, &prev, err_only); fflush(stdout); sleep(interval); -- cgit v1.2.3 From c4eb7f4643ce3741e63b7d3f9b70bb7a637e738a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 6 Oct 2017 10:41:51 +0200 Subject: samples/bpf: xdp_monitor increase memory rlimit Other concurrent running programs, like perf or the XDP program what needed to be monitored, might take up part of the max locked memory limit. Thus, the xdp_monitor tool have to set the RLIMIT_MEMLOCK to RLIM_INFINITY, as it cannot determine a more sane limit. Using the man exit(3) specified EXIT_FAILURE return exit code, and correct other users too. 
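The pattern is the usual one for BPF loaders: raise the locked-memory limit before any maps or programs are created, because their kernel memory is charged against RLIMIT_MEMLOCK and the default budget is easily exhausted by other users. A minimal standalone sketch of that pattern (a hypothetical loader skeleton, not part of the patch itself):

#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY };

	/* Must run before bpf_create_map()/bpf_prog_load(); otherwise those
	 * calls can fail with EPERM once the locked-memory budget has been
	 * used up by e.g. perf or an already loaded XDP program.
	 */
	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		perror("setrlimit(RLIMIT_MEMLOCK)");
		return EXIT_FAILURE;
	}

	/* ... create maps and load BPF programs here ... */
	return EXIT_SUCCESS;
}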
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- samples/bpf/xdp_monitor_user.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index 97c3456c11b2..eaba165b3549 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -20,6 +20,7 @@ static const char *__doc_err_only__= #include #include +#include #include #include #include @@ -295,6 +296,7 @@ static void print_bpf_prog_info(void) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int longindex = 0, opt; int ret = EXIT_SUCCESS; char bpf_obj_file[256]; @@ -325,13 +327,18 @@ int main(int argc, char **argv) } } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return EXIT_FAILURE; + } + if (load_bpf_file(bpf_obj_file)) { printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return 1; + return EXIT_FAILURE; } if (!prog_fd[0]) { printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return 1; + return EXIT_FAILURE; } if (debug) { -- cgit v1.2.3 From 020a32d9581ac824d038b0b4e24e977e3cc8589f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:21 -0700 Subject: bpf: add a test case for helper bpf_perf_event_read_value The bpf sample program tracex6 is enhanced to use the new helper to read enabled/running time as well. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/tracex6_kern.c | 26 ++++++++++++++++++++++++++ samples/bpf/tracex6_user.c | 13 ++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index e7d180305974..46c557afac73 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = { .value_size = sizeof(u64), .max_entries = 64, }; +struct bpf_map_def SEC("maps") values2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(int), + .value_size = sizeof(struct bpf_perf_event_value), + .max_entries = 64, +}; SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) @@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } +SEC("kprobe/htab_map_lookup_elem") +int bpf_prog2(struct pt_regs *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *val, buf; + int error; + + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); + if (error) + return 0; + + val = bpf_map_lookup_elem(&values2, &key); + if (val) + *val = buf; + else + bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); + + return 0; +} + char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index a05a99a0752f..3341a96fc046 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -22,6 +22,7 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) { + struct bpf_perf_event_value value2; int pmu_fd, error = 0; cpu_set_t set; __u64 value; @@ -46,8 +47,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) fprintf(stderr, "Value missing for CPU %d\n", cpu); error = 1; goto on_exit; + } else { + fprintf(stderr, "CPU %d: %llu\n", cpu, value); + } + /* The above bpf_map_lookup_elem should trigger the second kprobe */ + if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { + 
fprintf(stderr, "Value2 missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, + value2.counter, value2.enabled, value2.running); } - fprintf(stderr, "CPU %d: %llu\n", cpu, value); on_exit: assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); -- cgit v1.2.3 From 81b9cf8028a17bdbdaa0da80b735b32150d4e89e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:23 -0700 Subject: bpf: add a test case for helper bpf_perf_prog_read_value The bpf sample program trace_event is enhanced to use the new helper to print out enabled/running time. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/trace_event_kern.c | 10 ++++++++++ samples/bpf/trace_event_user.c | 13 ++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'samples') diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index 41b6115a32eb..a77a583d94d4 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = { SEC("perf_event") int bpf_prog1(struct bpf_perf_event_data *ctx) { + char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; + char time_fmt2[] = "Get Time Failed, ErrCode: %d"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); + struct bpf_perf_event_value value_buf; struct key_t key; u64 *val, one = 1; + int ret; if (ctx->sample_period < 10000) /* ignore warmup */ @@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) return 0; } + ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); + if (!ret) + bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running); + else + bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 7bd827b84a67..bf4f1b6d9a52 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr) int *pmu_fd = malloc(nr_cpus * sizeof(int)); int i, error = 0; + /* system wide perf event, no need to inherit */ + attr->inherit = 0; + /* open perf_event on all cpus */ for (i = 0; i < nr_cpus; i++) { pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0); @@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr) { int pmu_fd; + /* per task perf event, enable inherit so the "dd ..." command can be traced properly. + * Enabling inherit will cause bpf_perf_prog_read_time helper failure. 
+ */ + attr->inherit = 1; + /* open task bound event */ pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); if (pmu_fd < 0) { @@ -175,14 +183,12 @@ static void test_bpf_perf_event(void) .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .inherit = 1, }; struct perf_event_attr attr_type_sw = { .sample_freq = SAMPLE_FREQ, .freq = 1, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, - .inherit = 1, }; struct perf_event_attr attr_hw_cache_l1d = { .sample_freq = SAMPLE_FREQ, @@ -192,7 +198,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), - .inherit = 1, }; struct perf_event_attr attr_hw_cache_branch_miss = { .sample_freq = SAMPLE_FREQ, @@ -202,7 +207,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_BPU | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), - .inherit = 1, }; struct perf_event_attr attr_type_raw = { .sample_freq = SAMPLE_FREQ, @@ -210,7 +214,6 @@ static void test_bpf_perf_event(void) .type = PERF_TYPE_RAW, /* Intel Instruction Retired */ .config = 0xc0, - .inherit = 1, }; printf("Test HW_CPU_CYCLES\n"); -- cgit v1.2.3 From 9db9583839b760fc492a7b288edfe2213184a579 Mon Sep 17 00:00:00 2001 From: Abhijit Ayarekar Date: Fri, 13 Oct 2017 12:24:06 -0700 Subject: bpf: Add -target to clang switch while cross compiling. Update to llvm excludes assembly instructions. llvm git revision is below commit 65fad7c26569 ("bpf: add inline-asm support") This change will be part of llvm release 6.0 __ASM_SYSREG_H define is not required for native compile. -target switch includes appropriate target specific files while cross compiling Tested on x86 and arm64. Signed-off-by: Abhijit Ayarekar Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ebc2ad69b62c..81f9fcd736b7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -180,6 +180,7 @@ CLANG ?= clang # Detect that we're cross compiling and use the cross compiler ifdef CROSS_COMPILE HOSTCC = $(CROSS_COMPILE)gcc +CLANG_ARCH_ARGS = -target $(ARCH) endif # Trick to allow make to be run from this directory @@ -229,9 +230,9 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/%.o: $(src)/%.c $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ -I$(srctree)/tools/testing/selftests/bpf/ \ - -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ + -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ - -Wno-unknown-warning-option \ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ -- cgit v1.2.3 From fad3917e361b115f776563366415ffb2fc706bf1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:49 +0200 Subject: samples/bpf: add cpumap sample program xdp_redirect_cpu This sample program show how to use cpumap and the associated tracepoints. It provides command line stats, which shows how the XDP-RX process, cpumap-enqueue and cpumap kthread dequeue is cooperating on a per CPU basis. It also utilize the xdp_exception and xdp_redirect_err transpoints to allow users quickly to identify setup issues. 
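From userspace the cpumap is just another BPF map: the key is the destination CPU number and the value is the queue size for that CPU's kthread. A rough sketch of how the userspace side can make a CPU usable as a redirect target (simplified; add_cpu_entry and its fd arguments are illustrative, not the exact helper in the sample):

#include <stdio.h>
#include <linux/types.h>
#include "libbpf.h"	/* bpf_map_update_elem(), as used elsewhere in samples/bpf */

static int add_cpu_entry(int cpu_map_fd, int cpus_available_fd,
			 __u32 idx, __u32 cpu, __u32 qsize)
{
	/* Insert the CPU into the cpumap; the value is the kthread queue size */
	if (bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0)) {
		fprintf(stderr, "Failed to add CPU %u to cpumap\n", cpu);
		return -1;
	}
	/* Record it in cpus_available so the XDP programs can select it */
	return bpf_map_update_elem(cpus_available_fd, &idx, &cpu, 0);
}

The XDP side then only has to look up an entry in cpus_available and return bpf_redirect_map(&cpu_map, cpu_dest, 0), as the programs in xdp_redirect_cpu_kern.c below do.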
One issue with ixgbe driver is that the driver reset the link when loading XDP. This reset the procfs smp_affinity settings. Thus, after loading the program, these must be reconfigured. The easiest workaround it to reduce the RX-queue to e.g. two via: # ethtool --set-channels ixgbe1 combined 2 And then add CPUs above 0 and 1, like: # xdp_redirect_cpu --dev ixgbe1 --prog 2 --cpu 2 --cpu 3 --cpu 4 Another issue with ixgbe is that the page recycle mechanism is tied to the RX-ring size. And the default setting of 512 elements is too small. This is the same issue with regular devmap XDP_REDIRECT. To overcome this I've been using 1024 rx-ring size: # ethtool -G ixgbe1 rx 1024 tx 1024 V3: - whitespace cleanups - bpf tracepoint cannot access top part of struct V4: - report on kthread sched events, according to tracepoint change - report average bulk enqueue size V5: - bpf_map_lookup_elem on cpumap not allowed from bpf_prog use separate map to mark CPUs not available V6: - correct kthread sched summary output V7: - Added a --stress-mode for concurrently changing underlying cpumap Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- samples/bpf/Makefile | 4 + samples/bpf/xdp_redirect_cpu_kern.c | 609 +++++++++++++++++++++++++++++++ samples/bpf/xdp_redirect_cpu_user.c | 697 ++++++++++++++++++++++++++++++++++++ 3 files changed, 1310 insertions(+) create mode 100644 samples/bpf/xdp_redirect_cpu_kern.c create mode 100644 samples/bpf/xdp_redirect_cpu_user.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 81f9fcd736b7..3534ccfb5920 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -39,6 +39,7 @@ hostprogs-y += per_socket_stats_example hostprogs-y += load_sock_ops hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map +hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += syscall_tp @@ -84,6 +85,7 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o +xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o @@ -129,6 +131,7 @@ always += tcp_iw_kern.o always += tcp_clamp_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o +always += xdp_redirect_cpu_kern.o always += xdp_monitor_kern.o always += syscall_tp_kern.o @@ -169,6 +172,7 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf HOSTLOADLIBES_test_map_in_map += -lelf HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf +HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_syscall_tp += -lelf diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c new file mode 100644 index 000000000000..303e9e7161f3 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -0,0 +1,609 @@ +/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) + * + * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "bpf_helpers.h" + +#define MAX_CPUS 12 /* WARNING - sync with _user.c */ + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct bpf_map_def SEC("maps") cpu_map = { + .type = BPF_MAP_TYPE_CPUMAP, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = MAX_CPUS, +}; + +/* Common stats data record to keep userspace more simple */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; +}; + +/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success + * feedback. Redirect TX errors can be caught via a tracepoint. + */ +struct bpf_map_def SEC("maps") rx_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") redirect_err_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 2, + /* TODO: have entries for all possible errno's */ +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Set of maps controlling available CPU, and for iterating through + * selectable redirect CPUs. + */ +struct bpf_map_def SEC("maps") cpus_available = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = MAX_CPUS, +}; +struct bpf_map_def SEC("maps") cpus_count = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; +struct bpf_map_def SEC("maps") cpus_iterator = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Helper parse functions */ + +/* Parse Ethernet layer 2, extract network layer 3 offset and protocol + * + * Returns false on error and non-supported ether-type + */ +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +static __always_inline +bool parse_eth(struct ethhdr *eth, void *data_end, + u16 *eth_proto, u64 *l3_offset) +{ + u16 eth_type; + u64 offset; + + offset = sizeof(*eth); + if ((void *)eth + offset > data_end) + return false; + + eth_type = eth->h_proto; + + /* Skip non 802.3 Ethertypes */ + if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) + return false; + + /* Handle VLAN tagged packet */ + if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + /* TODO: Handle double VLAN tagged packet */ + + *eth_proto = ntohs(eth_type); + *l3_offset = offset; + return true; +} + +static __always_inline +u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void 
*data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + struct udphdr *udph; + u16 dport; + + if (iph + 1 > data_end) + return 0; + if (!(iph->protocol == IPPROTO_UDP)) + return 0; + + udph = (void *)(iph + 1); + if (udph + 1 > data_end) + return 0; + + dport = ntohs(udph->dest); + return dport; +} + +static __always_inline +int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static __always_inline +int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp_cpu_map0") +int xdp_prognum0_no_touch(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map1_touch_data") +int xdp_prognum1_touch_data(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u16 eth_type; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Validate packet length is minimum Eth header size */ + if (eth + 1 > data_end) + return XDP_ABORTED; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Read packet data, and use it (drop non 802.3 Ethertypes) */ + eth_type = eth->h_proto; + if (ntohs(eth_type) < ETH_P_802_3_MIN) { + rec->dropped++; + return XDP_DROP; + } + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map2_round_robin") +int xdp_prognum2_round_robin(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 cpu_dest; + u32 *cpu_lookup; + u32 key0 = 0; + + u32 *cpu_selected; + u32 *cpu_iterator; + u32 *cpu_max; + u32 cpu_idx; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); + if (!cpu_iterator) + return XDP_ABORTED; + cpu_idx = *cpu_iterator; + + *cpu_iterator += 1; + if (*cpu_iterator == *cpu_max) + *cpu_iterator = 0; + + cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key0); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return 
XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map3_proto_separate") +int xdp_prognum3_proto_separate(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map4_ddos_filter_pktgen") +int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u16 dest_port; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + /* DDoS filter UDP port 9 (pktgen) */ + dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); + if (dest_port == 9) { + if (rec) + rec->dropped++; + return XDP_DROP; + } + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + + +char _license[] SEC("license") = "GPL"; + +/*** Trace point code ***/ + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_redirect_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; 
// offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline +int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +{ + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + int err = ctx->err; + + if (!err) + key = XDP_REDIRECT_SUCCESS; + + rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); + if (!rec) + return 0; + rec->dropped += 1; + + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. + */ +} + +SEC("tracepoint/xdp/xdp_redirect_err") +int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +SEC("tracepoint/xdp/xdp_redirect_map_err") +int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&exception_cnt, &key); + if (!rec) + return 1; + rec->dropped += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->issue += 1; + + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. 
+ */ + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->issue++; + + return 0; +} diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c new file mode 100644 index 000000000000..35fec9fecb57 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -0,0 +1,697 @@ +/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +static const char *__doc__ = + " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define MAX_CPUS 12 /* WARNING - sync with _kern.c */ + +/* How many xdp_progs are defined in _kern.c */ +#define MAX_PROG 5 + +/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead + * use bpf/libbpf.h), but cannot as (currently) needed for XDP + * attaching to a device via set_link_xdp_fd() + */ +#include "libbpf.h" +#include "bpf_load.h" + +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"debug", no_argument, NULL, 'D' }, + {"sec", required_argument, NULL, 's' }, + {"prognum", required_argument, NULL, 'p' }, + {"qsize", required_argument, NULL, 'q' }, + {"cpu", required_argument, NULL, 'c' }, + {"stress-mode", no_argument, NULL, 'x' }, + {"no-separators", no_argument, NULL, 'z' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + if (ifindex > -1) + set_link_xdp_fd(ifindex, -1, xdp_flags); + exit(EXIT_OK); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +/* gettime returns the current time of day in nanoseconds. 
+ * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC) + * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE) + */ +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! (%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record rx_cnt; + struct record redir_err; + struct record kthread; + struct record exception; + struct record enq[MAX_CPUS]; +}; + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.issue = sum_issue; + return true; +} + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + size_t size; + + size = sizeof(struct datarec) * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int i; + + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rx_cnt.cpu = alloc_record_per_cpu(); + rec->redir_err.cpu = alloc_record_per_cpu(); + rec->kthread.cpu = alloc_record_per_cpu(); + rec->exception.cpu = alloc_record_per_cpu(); + for (i = 0; i < MAX_CPUS; i++) + rec->enq[i].cpu = alloc_record_per_cpu(); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < MAX_CPUS; i++) + free(r->enq[i].cpu); + free(r->exception.cpu); + free(r->kthread.cpu); + free(r->redir_err.cpu); + free(r->rx_cnt.cpu); + free(r); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + 
packets = r->dropped - p->dropped; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int prog_num) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + double pps = 0, drop = 0, err = 0; + struct record *rec, *prev; + int to_cpu; + double t; + int i; + + /* Header */ + printf("Running XDP/eBPF prog_num:%d\n", prog_num); + printf("%-15s %-7s %-14s %-11s %-9s\n", + "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info"); + + /* XDP rx_cnt */ + { + char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n"; + char *errstr = ""; + + rec = &stats_rec->rx_cnt; + prev = &stats_prev->rx_cnt; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "cpu-dest/err"; + if (pps > 0) + printf(fmt_rx, "XDP-RX", + i, pps, drop, err, errstr); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX", "total", pps, drop); + } + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; + char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; + char *errstr = ""; + + rec = &stats_rec->enq[to_cpu]; + prev = &stats_prev->enq[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + errstr = "bulk-average"; + err = pps / err; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt, "cpumap-enqueue", + i, to_cpu, pps, drop, err, errstr); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (err > 0) { + errstr = "bulk-average"; + err = pps / err; /* calc average bulk size */ + } + printf(fm2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, err, errstr); + } + } + + /* cpumap kthread stats */ + { + char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *e_str = ""; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + e_str = "sched"; + if (pps > 0) + printf(fmt_k, "cpumap_kthread", + i, pps, drop, err, e_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (err > 0) + e_str = "sched-sum"; + printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str); + } + + /* XDP redirect err tracepoints (very unlikely) */ + { + char *fmt_err = "%-15s %-7d %'-14.0f 
%'-11.0f\n"; + char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; + + rec = &stats_rec->redir_err; + prev = &stats_prev->redir_err; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + if (pps > 0) + printf(fmt_err, "redirect_err", i, pps, drop); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + printf(fm2_err, "redirect_err", "total", pps, drop); + } + + /* XDP general exception tracepoints */ + { + char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; + char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; + + rec = &stats_rec->exception; + prev = &stats_prev->exception; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + if (pps > 0) + printf(fmt_err, "xdp_exception", i, pps, drop); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + printf(fm2_err, "xdp_exception", "total", pps, drop); + } + + printf("\n"); + fflush(stdout); +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i; + + fd = map_fd[1]; /* map: rx_cnt */ + map_collect_percpu(fd, 0, &rec->rx_cnt); + + fd = map_fd[2]; /* map: redirect_err_cnt */ + map_collect_percpu(fd, 1, &rec->redir_err); + + fd = map_fd[3]; /* map: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_percpu(fd, i, &rec->enq[i]); + + fd = map_fd[4]; /* map: cpumap_kthread_cnt */ + map_collect_percpu(fd, 0, &rec->kthread); + + fd = map_fd[8]; /* map: exception_cnt */ + map_collect_percpu(fd, 0, &rec->exception); +} + + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static int create_cpu_entry(__u32 cpu, __u32 queue_size, + __u32 avail_idx, bool new) +{ + __u32 curr_cpus_count = 0; + __u32 key = 0; + int ret; + + /* Add a CPU entry to cpumap, as this allocate a cpu entry in + * the kernel for the cpu. + */ + ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0); + if (ret) { + fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); + exit(EXIT_FAIL_BPF); + } + + /* Inform bpf_prog's that a new CPU is available to select + * from via some control maps. + */ + /* map_fd[5] = cpus_available */ + ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0); + if (ret) { + fprintf(stderr, "Add to avail CPUs failed\n"); + exit(EXIT_FAIL_BPF); + } + + /* When not replacing/updating existing entry, bump the count */ + /* map_fd[6] = cpus_count */ + ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count); + if (ret) { + fprintf(stderr, "Failed reading curr cpus_count\n"); + exit(EXIT_FAIL_BPF); + } + if (new) { + curr_cpus_count++; + ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0); + if (ret) { + fprintf(stderr, "Failed write curr cpus_count\n"); + exit(EXIT_FAIL_BPF); + } + } + /* map_fd[7] = cpus_iterator */ + printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n", + new ? "Add-new":"Replace", cpu, avail_idx, + queue_size, curr_cpus_count); + + return 0; +} + +/* CPUs are zero-indexed. 
Thus, add a special sentinel default value + * in map cpus_available to mark CPU index'es not configured + */ +static void mark_cpus_unavailable(void) +{ + __u32 invalid_cpu = MAX_CPUS; + int ret, i; + + for (i = 0; i < MAX_CPUS; i++) { + /* map_fd[5] = cpus_available */ + ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0); + if (ret) { + fprintf(stderr, "Failed marking CPU unavailable\n"); + exit(EXIT_FAIL_BPF); + } + } +} + +/* Stress cpumap management code by concurrently changing underlying cpumap */ +static void stress_cpumap(void) +{ + /* Changing qsize will cause kernel to free and alloc a new + * bpf_cpu_map_entry, with an associated/complicated tear-down + * procedure. + */ + create_cpu_entry(1, 1024, 0, false); + create_cpu_entry(1, 128, 0, false); + create_cpu_entry(1, 16000, 0, false); +} + +static void stats_poll(int interval, bool use_separators, int prog_num, + bool stress_mode) +{ + struct stats_record *record, *prev; + + record = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(record); + + /* Trick to pretty printf with thousands separators use %' */ + if (use_separators) + setlocale(LC_NUMERIC, "en_US"); + + while (1) { + swap(&prev, &record); + stats_collect(record); + stats_print(record, prev, prog_num); + sleep(interval); + if (stress_mode) + stress_cpumap(); + } + + free_stats_record(record); + free_stats_record(prev); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + bool use_separators = true; + bool stress_mode = false; + char filename[256]; + bool debug = false; + int added_cpus = 0; + int longindex = 0; + int interval = 2; + int prog_num = 0; + int add_cpu = -1; + __u32 qsize; + int opt; + + /* Notice: choosing he queue size is very important with the + * ixgbe driver, because it's driver page recycling trick is + * dependend on pages being returned quickly. The number of + * out-standing packets in the system must be less-than 2x + * RX-ring size. 
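+ * Example (illustrative numbers): with a 512 descriptor RX-ring the
+ * outstanding packets must stay below 1024, so the default qsize of
+ * 128+64 set below leaves ample headroom even with a handful of
+ * remote CPUs configured.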
+ */ + qsize = 128+64; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_bpf_file(filename)) { + fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf); + return EXIT_FAIL; + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + mark_cpus_unavailable(); + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hSd:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'D': + debug = true; + break; + case 'x': + stress_mode = true; + break; + case 'z': + use_separators = false; + break; + case 'p': + /* Selecting eBPF prog to load */ + prog_num = atoi(optarg); + if (prog_num < 0 || prog_num >= MAX_PROG) { + fprintf(stderr, + "--prognum too large err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 'c': + /* Add multiple CPUs */ + add_cpu = strtoul(optarg, NULL, 0); + if (add_cpu >= MAX_CPUS) { + fprintf(stderr, + "--cpu nr too large for cpumap err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + create_cpu_entry(add_cpu, qsize, added_cpus, true); + added_cpus++; + break; + case 'q': + qsize = atoi(optarg); + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + /* Required option */ + if (add_cpu == -1) { + fprintf(stderr, "ERR: required option --cpu missing\n"); + fprintf(stderr, " Specify multiple --cpu option to add more\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + + /* Remove XDP program when program is interrupted */ + signal(SIGINT, int_exit); + + if (set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + if (debug) { + printf("Debug-mode reading trace pipe (fix #define DEBUG)\n"); + read_trace_pipe(); + } + + stats_poll(interval, use_separators, prog_num, stress_mode); + return EXIT_OK; +} -- cgit v1.2.3 From c890063e440456e75c2e70f6bcec3797f1771eb6 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:42 -0700 Subject: bpf: sample BPF_SOCKET_OPS_BASE_RTT program Sample socket_ops BPF program to test the BPF helper function bpf_getsocketops and the new socket_ops op BPF_SOCKET_OPS_BASE_RTT. The program provides a base RTT of 80us when the calling flow is within a DC (as determined by the IPV6 prefix) and the congestion algorithm is "nv". Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked_by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- samples/bpf/Makefile | 1 + samples/bpf/tcp_basertt_kern.c | 78 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 samples/bpf/tcp_basertt_kern.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3534ccfb5920..ea2b9e6135f3 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -129,6 +129,7 @@ always += tcp_bufs_kern.o always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o +always += tcp_basertt_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000000..4bf4fc597db9 --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set base_rtt to 80us when host is running TCP-NV and + * both hosts are in the same datacenter (as determined by IPv6 prefix). + * + * Use load_sock_ops to load this BPF program. + */ + +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char cong[20]; + char nv[] = "nv"; + int rv = 0, n; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_BASE_RTT: + n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) { + /* Set base_rtt to 80us */ + rv = 80; + } else if (n) { + rv = n; + } else { + rv = -1; + } + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From bfdf75693875fd53d6f08d5fec9506a864f07372 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 20 Oct 2017 11:05:43 -0700 Subject: bpf: create samples/bpf/tcp_bpf.readme Readme file explaining how to create a cgroupv2 and attach one of the tcp_*_kern.o socket_ops BPF program. Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked_by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/tcp_bbf.readme | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 samples/bpf/tcp_bbf.readme (limited to 'samples') diff --git a/samples/bpf/tcp_bbf.readme b/samples/bpf/tcp_bbf.readme new file mode 100644 index 000000000000..831fb601e3c9 --- /dev/null +++ b/samples/bpf/tcp_bbf.readme @@ -0,0 +1,26 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. 
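+(The /tmp/cgroupv2 path used below is only an example mount point, and the
+mount step requires root privileges.)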
+ + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2 To load +(attach) one of the tcp_*_kern.o programs: + + ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + +If the "-l" flag is used, the load_sock_ops program will continue to run +printing the BPF log buffer. The tcp_*_kern.o programs use special print +functions to print logging information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). + +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + ./load_sock_ops -r /tmp/cgroupv2/foo -- cgit v1.2.3 From a678be5cc747bafc8c66f2fe00a103422587a5eb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:09 -0700 Subject: bpf: add a test case to test single tp multiple bpf attachment The bpf sample program syscall_tp is modified to show attachment of more than bpf programs for a particular kernel tracepoint. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- samples/bpf/syscall_tp_user.c | 66 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 13 deletions(-) (limited to 'samples') diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index a3cb91ebf4e7..9169d3207f18 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -23,6 +23,13 @@ * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. */ +static void usage(const char *cmd) +{ + printf("USAGE: %s [-i num_progs] [-h]\n", cmd); + printf(" -i num_progs # number of progs of the test\n"); + printf(" -h # help\n"); +} + static void verify_map(int map_id) { __u32 key = 0; @@ -32,22 +39,29 @@ static void verify_map(int map_id) fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); return; } - if (val == 0) + if (val == 0) { fprintf(stderr, "failed: map #%d returns value 0\n", map_id); + return; + } + val = 0; + if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { + fprintf(stderr, "map_update failed: %s\n", strerror(errno)); + return; + } } -int main(int argc, char **argv) +static int test(char *filename, int num_progs) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - char filename[256]; - int fd; + int i, fd, map0_fds[num_progs], map1_fds[num_progs]; - setrlimit(RLIMIT_MEMLOCK, &r); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - fprintf(stderr, "%s", bpf_log_buf); - return 1; + for (i = 0; i < num_progs; i++) { + if (load_bpf_file(filename)) { + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]); + map0_fds[i] = map_fd[0]; + map1_fds[i] = map_fd[1]; } /* current load_bpf_file has perf_event_open default pid = -1 @@ -64,8 +78,34 @@ int main(int argc, char **argv) close(fd); /* verify the map */ - verify_map(map_fd[0]); - verify_map(map_fd[1]); + for (i = 0; i < num_progs; i++) { + verify_map(map0_fds[i]); + verify_map(map1_fds[i]); + } return 0; } + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int opt, num_progs = 1; + char filename[256]; + + while ((opt = getopt(argc, argv, "i:h")) != -1) { + switch (opt) { + case 'i': + num_progs = atoi(optarg); + break; + case 'h': + 
default: + usage(argv[0]); + return 0; + } + } + + setrlimit(RLIMIT_MEMLOCK, &r); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + return test(filename, num_progs); +} -- cgit v1.2.3 From 6dfca831c03ef654b1f7bff1b8d487d330e9f76b Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Fri, 27 Oct 2017 16:12:30 -0700 Subject: samples/bpf: adjust rlimit RLIMIT_MEMLOCK for xdp1 Default rlimit RLIMIT_MEMLOCK is 64KB, causes bpf map failure. e.g. [root@lab bpf]#./xdp1 -N $( Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/xdp1_user.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'samples') diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 2431c0321b71..fdaefe91801d 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "bpf_load.h" #include "bpf_util.h" @@ -69,6 +70,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int opt; @@ -91,6 +93,12 @@ int main(int argc, char **argv) usage(basename(argv[0])); return 1; } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex = strtoul(argv[optind], NULL, 0); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); -- cgit v1.2.3 From 21d72af7dcf0c9f78f4fbdb93315568731014e66 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Fri, 27 Oct 2017 17:28:22 -0700 Subject: samples/bpf: adjust rlimit RLIMIT_MEMLOCK for xdp_redirect_map Default rlimit RLIMIT_MEMLOCK is 64KB, causes bpf map failure. e.g. [root@labbpf]# ./xdp_redirect_map $( $( Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/xdp_redirect_map_user.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'samples') diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index d4d86a273fba..978a532f0748 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "bpf_load.h" #include "bpf_util.h" @@ -74,6 +75,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int ret, opt, key = 0; @@ -97,6 +99,11 @@ int main(int argc, char **argv) return 1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex_in = strtoul(argv[optind], NULL, 0); ifindex_out = strtoul(argv[optind + 1], NULL, 0); printf("input: %d output: %d\n", ifindex_in, ifindex_out); -- cgit v1.2.3 From 22ac5ad4a7d4e201d19b7f04ce8d79346c80a34b Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Wed, 1 Nov 2017 11:41:09 +0100 Subject: samples/pktgen: Add some helper functions 1. given a device, get its NUMA belongings 2. given a device, get its queues' irq numbers. 3. given a NUMA node, get its cpu id list. Signed-off-by: Robert Hoo Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- samples/pktgen/functions.sh | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'samples') diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh index 205e4cde4601..f8bb3cd0f4ce 100644 --- a/samples/pktgen/functions.sh +++ b/samples/pktgen/functions.sh @@ -119,3 +119,46 @@ function root_check_run_with_sudo() { err 4 "cannot perform sudo run of $0" fi } + +# Exact input device's NUMA node info +function get_iface_node() +{ + local node=$( Date: Wed, 1 Nov 2017 11:41:14 +0100 Subject: samples/pktgen: add script pktgen_sample06_numa_awared_queue_irq_affinity.sh This script simply does: * Detect $DEV's NUMA node belonging. * Bind each thread (processor of NUMA locality) with each $DEV queue's irq affinity, 1:1 mapping. * How many '-t' threads input determines how many queues will be utilized. If '-f' designates first cpu id, then offset in the NUMA node's cpu list. (Changes by Jesper: allow changing count from cmdline via '-n') Signed-off-by: Robert Hoo Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- ...tgen_sample06_numa_awared_queue_irq_affinity.sh | 97 ++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100755 samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh (limited to 'samples') diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh new file mode 100755 index 000000000000..353adc17205e --- /dev/null +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Multiqueue: Using pktgen threads for sending on multiple CPUs +# * adding devices to kernel threads which are in the same NUMA node +# * bound devices queue's irq affinity to the threads, 1:1 mapping +# * notice the naming scheme for keeping device names unique +# * nameing scheme: dev@thread_number +# * flow variation via random UDP source port +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" +# +# Required param: -i dev in $DEV +source ${basedir}/parameters.sh + +# Base Config +DELAY="0" # Zero means max speed +[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" + +# Flow variation random source port between min and max +UDP_MIN=9 +UDP_MAX=109 + +node=`get_iface_node $DEV` +irq_array=(`get_iface_irqs $DEV`) +cpu_array=(`get_node_cpus $node`) + +[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \ + err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})" + +# (example of setting default params in your script) +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((i = 0; i < $THREADS; i++)); do + # The device name is extended with @name, using thread number to + # make then unique, but any name will do. 
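+    # Example (hypothetical setup): with '-i eth0 -t 2 -f 2' on a NIC in
+    # NUMA node 0 whose CPU list is 0-5, the devices become eth0@2 and
+    # eth0@3, bound to CPUs 2 and 3 of that node.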
+ # Set the queue's irq affinity to this $thread (processor) + # if '-f' is designated, offset cpu id + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list + info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`" + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # select queue and bind the queue and $dev in 1:1 relationship + queue_num=$i + info "queue number is $queue_num" + pg_set $dev "queue_map_min $queue_num" + pg_set $dev "queue_map_max $queue_num" + + # Notice config queue to map to cpu (mirrors smp_processor_id()) + # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number + pg_set $dev "flag QUEUE_MAP_CPU" + + # Base config of dev + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + + # Flag example disabling timestamping + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst$IP6 $DEST_IP" + + # Setup random UDP port src range + pg_set $dev "flag UDPSRC_RND" + pg_set $dev "udp_src_min $UDP_MIN" + pg_set $dev "udp_src_max $UDP_MAX" +done + +# start_run +echo "Running... ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +for ((i = 0; i < $THREADS; i++)); do + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done -- cgit v1.2.3 From 9efc44d74b586218e923e3dafb3462d21948c5c6 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Nov 2017 11:41:19 +0100 Subject: samples/pktgen: update sample03, no need for clones when bursting Like sample05, don't use pktgen clone_skb feature when using 'burst' feature, it is not really needed. This brings the burst users in sync. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- samples/pktgen/pktgen_sample03_burst_single_flow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh index 8d26e0ca683d..8a46daf27966 100755 --- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh @@ -30,7 +30,7 @@ if [ -z "$DEST_IP" ]; then fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$BURST" ] && BURST=32 -[ -z "$CLONE_SKB" ] && CLONE_SKB="100000" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely # Base Config -- cgit v1.2.3 From a4b6ade8359fc265fb4f8691fea33f4eaa66c951 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Nov 2017 11:41:24 +0100 Subject: samples/pktgen: remove remaining old pktgen sample scripts Since commit 0f06a6787e05 ("samples: Add an IPv6 '-6' option to the pktgen scripts") the newer pktgen_sampleXX script does show howto use IPv6 with pktgen. Thus, there is no longer a reason to keep the older sample scripts around. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- samples/pktgen/pktgen.conf-1-1-ip6 | 60 ---------------------------- samples/pktgen/pktgen.conf-1-1-ip6-rdos | 63 ------------------------------ samples/pktgen/pktgen.conf-1-2 | 69 --------------------------------- 3 files changed, 192 deletions(-) delete mode 100755 samples/pktgen/pktgen.conf-1-1-ip6 delete mode 100755 samples/pktgen/pktgen.conf-1-1-ip6-rdos delete mode 100755 samples/pktgen/pktgen.conf-1-2 (limited to 'samples') diff --git a/samples/pktgen/pktgen.conf-1-1-ip6 b/samples/pktgen/pktgen.conf-1-1-ip6 deleted file mode 100755 index 0b9ffd47fd41..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6 +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6 fec0::1" - pgset "src6 fec0::2" - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-1-ip6-rdos b/samples/pktgen/pktgen.conf-1-1-ip6-rdos deleted file mode 100755 index ad98e5f40776..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6-rdos +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 means maximum speed. - -# We need to do alloc for every skb since we cannot clone here. -CLONE_SKB="clone_skb 0" - -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6_min fec0::1" - pgset "dst6_max fec0::FFFF:FFFF" - - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... 
ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-2 b/samples/pktgen/pktgen.conf-1-2 deleted file mode 100755 index ba4eb26e168d..000000000000 --- a/samples/pktgen/pktgen.conf-1-2 +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# One CPU means one thread. One CPU example. We add eth1, eth2 respectivly. - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - echo "Adding eth2" - pgset "add_device eth2" - - -# device config -# delay 0 means maximum speed. - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 60" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 10.10.11.2" - pgset "dst_mac 00:04:23:08:91:dc" - -PGDEV=/proc/net/pktgen/eth2 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 192.168.2.2" - pgset "dst_mac 00:04:23:08:91:de" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 /proc/net/pktgen/eth2 -- cgit v1.2.3 From 9d1f15941967cd80fc3baa3322751fab532f98a4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:33 -0500 Subject: bpf: move cgroup_helpers from samples/bpf/ to tools/testing/selftesting/bpf/ The purpose of this move is to use these files in bpf tests. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/Makefile | 5 +- samples/bpf/cgroup_helpers.c | 178 ------------------------------------------- samples/bpf/cgroup_helpers.h | 17 ----- 3 files changed, 3 insertions(+), 197 deletions(-) delete mode 100644 samples/bpf/cgroup_helpers.c delete mode 100644 samples/bpf/cgroup_helpers.h (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 6a9321ec348a..5994075b080d 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -46,6 +46,7 @@ hostprogs-y += syscall_tp # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o +CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o test_lru_dist-objs := test_lru_dist.o $(LIBBPF) sock_example-objs := sock_example.o $(LIBBPF) @@ -69,13 +70,13 @@ map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o test_cgrp2_attach-objs := $(LIBBPF) test_cgrp2_attach.o -test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o cgroup_helpers.o +test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o $(CGROUP_HELPERS) test_cgrp2_sock-objs := $(LIBBPF) test_cgrp2_sock.o test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o -test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) cgroup_helpers.o \ +test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c deleted file mode 100644 index f3bca3ade0f3..000000000000 --- a/samples/bpf/cgroup_helpers.c +++ /dev/null @@ -1,178 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "cgroup_helpers.h" - -/* - * To avoid relying on the system setup, when setup_cgroup_env is called - * we create a new mount namespace, and cgroup namespace. The cgroup2 - * root is mounted at CGROUP_MOUNT_PATH - * - * Unfortunately, most people don't have cgroupv2 enabled at this point in time. - * It's easier to create our own mount namespace and manage it ourselves. - * - * We assume /mnt exists. - */ - -#define WALK_FD_LIMIT 16 -#define CGROUP_MOUNT_PATH "/mnt" -#define CGROUP_WORK_DIR "/cgroup-test-work-dir" -#define format_cgroup_path(buf, path) \ - snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \ - CGROUP_WORK_DIR, path) - -/** - * setup_cgroup_environment() - Setup the cgroup environment - * - * After calling this function, cleanup_cgroup_environment should be called - * once testing is complete. - * - * This function will print an error to stderr and return 1 if it is unable - * to setup the cgroup environment. If setup is successful, 0 is returned. 
- */ -int setup_cgroup_environment(void) -{ - char cgroup_workdir[PATH_MAX + 1]; - - format_cgroup_path(cgroup_workdir, ""); - - if (unshare(CLONE_NEWNS)) { - log_err("unshare"); - return 1; - } - - if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { - log_err("mount fakeroot"); - return 1; - } - - if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) && errno != EBUSY) { - log_err("mount cgroup2"); - return 1; - } - - /* Cleanup existing failed runs, now that the environment is setup */ - cleanup_cgroup_environment(); - - if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) { - log_err("mkdir cgroup work dir"); - return 1; - } - - return 0; -} - -static int nftwfunc(const char *filename, const struct stat *statptr, - int fileflags, struct FTW *pfwt) -{ - if ((fileflags & FTW_D) && rmdir(filename)) - log_err("Removing cgroup: %s", filename); - return 0; -} - - -static int join_cgroup_from_top(char *cgroup_path) -{ - char cgroup_procs_path[PATH_MAX + 1]; - pid_t pid = getpid(); - int fd, rc = 0; - - snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), - "%s/cgroup.procs", cgroup_path); - - fd = open(cgroup_procs_path, O_WRONLY); - if (fd < 0) { - log_err("Opening Cgroup Procs: %s", cgroup_procs_path); - return 1; - } - - if (dprintf(fd, "%d\n", pid) < 0) { - log_err("Joining Cgroup"); - rc = 1; - } - - close(fd); - return rc; -} - -/** - * join_cgroup() - Join a cgroup - * @path: The cgroup path, relative to the workdir, to join - * - * This function expects a cgroup to already be created, relative to the cgroup - * work dir, and it joins it. For example, passing "/my-cgroup" as the path - * would actually put the calling process into the cgroup - * "/cgroup-test-work-dir/my-cgroup" - * - * On success, it returns 0, otherwise on failure it returns 1. - */ -int join_cgroup(char *path) -{ - char cgroup_path[PATH_MAX + 1]; - - format_cgroup_path(cgroup_path, path); - return join_cgroup_from_top(cgroup_path); -} - -/** - * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment - * - * This is an idempotent function to delete all temporary cgroups that - * have been created during the test, including the cgroup testing work - * directory. - * - * At call time, it moves the calling process to the root cgroup, and then - * runs the deletion process. It is idempotent, and should not fail, unless - * a process is lingering. - * - * On failure, it will print an error to stderr, and try to continue. - */ -void cleanup_cgroup_environment(void) -{ - char cgroup_workdir[PATH_MAX + 1]; - - format_cgroup_path(cgroup_workdir, ""); - join_cgroup_from_top(CGROUP_MOUNT_PATH); - nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT); -} - -/** - * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD - * @path: The cgroup path, relative to the workdir, to join - * - * This function creates a cgroup under the top level workdir and returns the - * file descriptor. It is idempotent. - * - * On success, it returns the file descriptor. On failure it returns 0. - * If there is a failure, it prints the error to stderr. - */ -int create_and_get_cgroup(char *path) -{ - char cgroup_path[PATH_MAX + 1]; - int fd; - - format_cgroup_path(cgroup_path, path); - if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup %s .. 
%s", path, cgroup_path); - return 0; - } - - fd = open(cgroup_path, O_RDONLY); - if (fd < 0) { - log_err("Opening Cgroup"); - return 0; - } - - return fd; -} diff --git a/samples/bpf/cgroup_helpers.h b/samples/bpf/cgroup_helpers.h deleted file mode 100644 index 06485e0002b3..000000000000 --- a/samples/bpf/cgroup_helpers.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __CGROUP_HELPERS_H -#define __CGROUP_HELPERS_H -#include -#include - -#define clean_errno() (errno == 0 ? "None" : strerror(errno)) -#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ - __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) - - -int create_and_get_cgroup(char *path); -int join_cgroup(char *path); -int setup_cgroup_environment(void); -void cleanup_cgroup_environment(void); - -#endif -- cgit v1.2.3 From 3e29cd0e6563d5fefd59e7225750ee9922f2dad5 Mon Sep 17 00:00:00 2001 From: Christina Jacob Date: Sun, 5 Nov 2017 08:52:30 +0530 Subject: xdp: Sample xdp program implementing ip forward Implements port to port forwarding with route table and arp table lookup for ipv4 packets using bpf_redirect helper function and lpm_trie map. Signed-off-by: Christina Jacob Signed-off-by: David S. Miller --- samples/bpf/Makefile | 4 + samples/bpf/xdp_router_ipv4_kern.c | 186 +++++++++++ samples/bpf/xdp_router_ipv4_user.c | 659 +++++++++++++++++++++++++++++++++++++ 3 files changed, 849 insertions(+) create mode 100644 samples/bpf/xdp_router_ipv4_kern.c create mode 100644 samples/bpf/xdp_router_ipv4_user.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 5994075b080d..3b4945c1eab0 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -29,6 +29,7 @@ hostprogs-y += test_cgrp2_sock hostprogs-y += test_cgrp2_sock2 hostprogs-y += xdp1 hostprogs-y += xdp2 +hostprogs-y += xdp_router_ipv4 hostprogs-y += test_current_task_under_cgroup hostprogs-y += trace_event hostprogs-y += sampleip @@ -76,6 +77,7 @@ test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o +xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o @@ -118,6 +120,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o always += test_cgrp2_tc_kern.o always += xdp1_kern.o always += xdp2_kern.o +always += xdp_router_ipv4_kern.o always += test_current_task_under_cgroup_kern.o always += trace_event_kern.o always += sampleip_kern.o @@ -166,6 +169,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt HOSTLOADLIBES_test_overhead += -lelf -lrt HOSTLOADLIBES_xdp1 += -lelf HOSTLOADLIBES_xdp2 += -lelf +HOSTLOADLIBES_xdp_router_ipv4 += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf HOSTLOADLIBES_trace_event += -lelf HOSTLOADLIBES_sampleip += -lelf diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c new file mode 100644 index 000000000000..993f56bc7b9a --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_kern.c @@ -0,0 +1,186 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include +#include + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie*/ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation*/ +struct bpf_map_def SEC("maps") lpm_map = { + .type = BPF_MAP_TYPE_LPM_TRIE, + .key_size = 8, + .value_size = sizeof(struct trie_value), + .max_entries = 50, + .map_flags = BPF_F_NO_PREALLOC, +}; + +/* Map for counter*/ +struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = 256, +}; + +/* Map for ARP table*/ +struct bpf_map_def SEC("maps") arp_table = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__be32), + .value_size = sizeof(__be64), + .max_entries = 50, +}; + +/* Map to keep the exact match entries in the route table*/ +struct bpf_map_def SEC("maps") exact_match = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__be32), + .value_size = sizeof(struct direct_map), + .max_entries = 50, +}; + +struct bpf_map_def SEC("maps") tx_port = { + .type = BPF_MAP_TYPE_DEVMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 100, +}; + +/* Function to set source and destination mac of the packet */ +static inline void set_src_dst_mac(void *data, void *src, void *dst) +{ + unsigned short *source = src; + unsigned short *dest = dst; + unsigned short *p = data; + + __builtin_memcpy(p, dest, 6); + __builtin_memcpy(p + 3, source, 6); +} + +/* Parse IPV4 packet to get SRC, DST IP and protocol */ +static inline int parse_ipv4(void *data, u64 nh_off, void *data_end, + __be32 *src, __be32 *dest) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + *src = iph->saddr; + *dest = iph->daddr; + return iph->protocol; +} + +SEC("xdp_router_ipv4") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + __be64 *dest_mac = NULL, *src_mac = NULL; + void *data = (void *)(long)ctx->data; + struct trie_value *prefix_value; + int rc = XDP_DROP, forward_to; + struct ethhdr *eth = data; + union key_4 key4; + long *value; + u16 h_proto; + u32 ipproto; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_ARP)) { + return XDP_PASS; + } else if (h_proto == htons(ETH_P_IP)) { + struct direct_map *direct_entry; + __be32 src_ip = 0, dest_ip = 0; + + ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); + direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); + /* Check for exact match, this would give a faster lookup*/ + if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + /* Look up in the trie for lpm*/ + key4.b32[0] = 32; + key4.b8[4] = dest_ip & 0xff; + key4.b8[5] = (dest_ip >> 8) & 0xff; + key4.b8[6] = (dest_ip >> 16) & 0xff; + key4.b8[7] = (dest_ip >> 24) & 
0xff; + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + return XDP_DROP; + src_mac = &prefix_value->value; + if (!src_mac) + return XDP_DROP; + dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); + if (!dest_mac) { + if (!prefix_value->gw) + return XDP_DROP; + dest_ip = prefix_value->gw; + dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); + } + forward_to = prefix_value->ifindex; + } + } else { + ipproto = 0; + } + if (src_mac && dest_mac) { + set_src_dst_mac(data, src_mac, dest_mac); + value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + return bpf_redirect_map(&tx_port, forward_to, 0); + } + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c new file mode 100644 index 000000000000..2c1fe3f4b1a4 --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -0,0 +1,659 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_load.h" +#include "libbpf.h" +#include +#include +#include +#include +#include +#include +#include +#include "bpf_util.h" + +int sock, sock_arp, flags = 0; +static int total_ifindex; +int *ifindex_list; +char buf[8192]; + +static int get_route_table(int rtm_family); +static void int_exit(int sig) +{ + int i = 0; + + for (i = 0; i < total_ifindex; i++) + set_link_xdp_fd(ifindex_list[i], -1, flags); + exit(0); +} + +static void close_and_exit(int sig) +{ + int i = 0; + + close(sock); + close(sock_arp); + + for (i = 0; i < total_ifindex; i++) + set_link_xdp_fd(ifindex_list[i], -1, flags); + exit(0); +} + +/* Get the mac address of the interface given interface name */ +static __be64 getmac(char *iface) +{ + struct ifreq ifr; + __be64 mac = 0; + int fd, i; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + ifr.ifr_addr.sa_family = AF_INET; + strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); + if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { + printf("ioctl failed leaving....\n"); + return -1; + } + for (i = 0; i < 6 ; i++) + *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; + close(fd); + return mac; +} + +static int recv_msg(struct sockaddr_nl sock_addr, int sock) +{ + struct nlmsghdr *nh; + int len, nll = 0; + char *buf_ptr; + + buf_ptr = buf; + while (1) { + len = recv(sock, buf_ptr, sizeof(buf) - nll, 0); + if (len < 0) + return len; + + nh = (struct nlmsghdr *)buf_ptr; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + buf_ptr += len; + nll += len; + if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH) + break; + + if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE) + break; + } + return nll; +} + +/* Function to parse the route entry returned by netlink + * Updates the route entry related map entries + */ +static void read_route(struct nlmsghdr *nh, int nll) +{ + char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; + struct bpf_lpm_trie_key *prefix_key; + struct rtattr *rt_attr; + struct rtmsg *rt_msg; + int rtm_family; + int rtl; + int i; + struct route_table { + int dst_len, iface, metric; + char *iface_name; + __be32 dst, gw; + __be64 mac; + } route; + struct arp_table { + __be64 mac; + __be32 dst; + }; + + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + if 
(nh->nlmsg_type == RTM_DELROUTE) + printf("DELETING Route entry\n"); + else if (nh->nlmsg_type == RTM_GETROUTE) + printf("READING Route entry\n"); + else if (nh->nlmsg_type == RTM_NEWROUTE) + printf("NEW Route entry\n"); + else + printf("%d\n", nh->nlmsg_type); + + memset(&route, 0, sizeof(route)); + printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n"); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct rtmsg *)NLMSG_DATA(nh); + rtm_family = rt_msg->rtm_family; + if (rtm_family == AF_INET) + if (rt_msg->rtm_table != RT_TABLE_MAIN) + continue; + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + rtl = RTM_PAYLOAD(nh); + + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + (*((__be32 *)RTA_DATA(rt_attr)))); + break; + case RTA_GATEWAY: + sprintf(gws, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case RTA_OIF: + sprintf(ifs, "%u", + *((int *)RTA_DATA(rt_attr))); + break; + case RTA_METRICS: + sprintf(metrics, "%u", + *((int *)RTA_DATA(rt_attr))); + default: + break; + } + } + sprintf(dsts_len, "%d", rt_msg->rtm_dst_len); + route.dst = atoi(dsts); + route.dst_len = atoi(dsts_len); + route.gw = atoi(gws); + route.iface = atoi(ifs); + route.metric = atoi(metrics); + route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); + route.iface_name = if_indextoname(route.iface, route.iface_name); + route.mac = getmac(route.iface_name); + if (route.mac == -1) { + int i = 0; + + for (i = 0; i < total_ifindex; i++) + set_link_xdp_fd(ifindex_list[i], -1, flags); + exit(0); + } + assert(bpf_map_update_elem(map_fd[4], &route.iface, &route.iface, 0) == 0); + if (rtm_family == AF_INET) { + struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; + } *prefix_value; + + prefix_key = alloca(sizeof(*prefix_key) + 3); + prefix_value = alloca(sizeof(*prefix_value)); + + prefix_key->prefixlen = 32; + prefix_key->prefixlen = route.dst_len; + direct_entry.mac = route.mac & 0xffffffffffff; + direct_entry.ifindex = route.iface; + direct_entry.arp.mac = 0; + direct_entry.arp.dst = 0; + if (route.dst_len == 32) { + if (nh->nlmsg_type == RTM_DELROUTE) + assert(bpf_map_delete_elem(map_fd[3], &route.dst) == 0); + else + if (bpf_map_lookup_elem(map_fd[2], &route.dst, &direct_entry.arp.mac) == 0) + direct_entry.arp.dst = route.dst; + assert(bpf_map_update_elem(map_fd[3], &route.dst, &direct_entry, 0) == 0); + } + for (i = 0; i < 4; i++) + prefix_key->data[i] = (route.dst >> i * 8) & 0xff; + + printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n", + (int)prefix_key->data[0], + (int)prefix_key->data[1], + (int)prefix_key->data[2], + (int)prefix_key->data[3], + route.gw, route.dst_len, + route.metric, + route.iface_name); + if (bpf_map_lookup_elem(map_fd[0], prefix_key, + prefix_value) < 0) { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = prefix_key->data[i]; + prefix_value->value = route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + + assert(bpf_map_update_elem(map_fd[0], + prefix_key, + prefix_value, 0 + ) == 0); + } else { + if (nh->nlmsg_type == RTM_DELROUTE) { + printf("deleting entry\n"); + printf("prefix key=%d.%d.%d.%d/%d", + prefix_key->data[0], + prefix_key->data[1], + prefix_key->data[2], + prefix_key->data[3], + prefix_key->prefixlen); + assert(bpf_map_delete_elem(map_fd[0], + prefix_key + ) == 0); + /* Rereading the route table to check if + * there is an entry with the same + 
* prefix but a different metric as the + * deleted enty. + */ + get_route_table(AF_INET); + } else if (prefix_key->data[0] == + prefix_value->prefix[0] && + prefix_key->data[1] == + prefix_value->prefix[1] && + prefix_key->data[2] == + prefix_value->prefix[2] && + prefix_key->data[3] == + prefix_value->prefix[3] && + route.metric >= prefix_value->metric) { + continue; + } else { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = + prefix_key->data[i]; + prefix_value->value = + route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + assert(bpf_map_update_elem( + map_fd[0], + prefix_key, + prefix_value, + 0) == 0); + } + } + } + memset(&route, 0, sizeof(route)); + memset(dsts, 0, sizeof(dsts)); + memset(dsts_len, 0, sizeof(dsts_len)); + memset(gws, 0, sizeof(gws)); + memset(ifs, 0, sizeof(ifs)); + memset(&route, 0, sizeof(route)); + } +} + +/* Function to read the existing route table when the process is launched*/ +static int get_route_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + + struct { + struct nlmsghdr nl; + struct rtmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETROUTE; + + req.rt.rtm_family = rtm_family; + req.rt.rtm_table = RT_TABLE_MAIN; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + printf("send to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_route(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to parse the arp entry returned by netlink + * Updates the arp entry related map entries + */ +static void read_arp(struct nlmsghdr *nh, int nll) +{ + struct rtattr *rt_attr; + char dsts[24], mac[24]; + struct ndmsg *rt_msg; + int rtl, ndm_family; + + struct arp_table { + __be64 mac; + __be32 dst; + } arp_entry; + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + if (nh->nlmsg_type == RTM_GETNEIGH) + printf("READING arp entry\n"); + printf("Address\tHwAddress\n"); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct ndmsg *)NLMSG_DATA(nh); + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + ndm_family = rt_msg->ndm_family; + rtl = RTM_PAYLOAD(nh); + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case NDA_LLADDR: + sprintf(mac, "%lld", + *((__be64 *)RTA_DATA(rt_attr))); + break; + default: + break; + } + } + arp_entry.dst = atoi(dsts); + arp_entry.mac = atol(mac); + 
printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac); + if (ndm_family == AF_INET) { + if (bpf_map_lookup_elem(map_fd[3], &arp_entry.dst, + &direct_entry) == 0) { + if (nh->nlmsg_type == RTM_DELNEIGH) { + direct_entry.arp.dst = 0; + direct_entry.arp.mac = 0; + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + direct_entry.arp.dst = arp_entry.dst; + direct_entry.arp.mac = arp_entry.mac; + } + assert(bpf_map_update_elem(map_fd[3], + &arp_entry.dst, + &direct_entry, 0 + ) == 0); + memset(&direct_entry, 0, sizeof(direct_entry)); + } + if (nh->nlmsg_type == RTM_DELNEIGH) { + assert(bpf_map_delete_elem(map_fd[2], &arp_entry.dst) == 0); + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + assert(bpf_map_update_elem(map_fd[2], + &arp_entry.dst, + &arp_entry.mac, 0 + ) == 0); + } + } + memset(&arp_entry, 0, sizeof(arp_entry)); + memset(dsts, 0, sizeof(dsts)); + } +} + +/* Function to read the existing arp table when the process is launched*/ +static int get_arp_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + struct { + struct nlmsghdr nl; + struct ndmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETNEIGH; + req.rt.ndm_state = NUD_REACHABLE; + req.rt.ndm_family = rtm_family; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + printf("send to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to keep track and update changes in route and arp table + * Give regular statistics of packets forwarded + */ +static int monitor_route(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + const unsigned int nr_keys = 256; + struct pollfd fds_route, fds_arp; + __u64 prev[nr_keys][nr_cpus]; + struct sockaddr_nl la, lr; + __u64 values[nr_cpus]; + struct nlmsghdr *nh; + int nll, ret = 0; + int interval = 5; + __u32 key; + int i; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + fcntl(sock, F_SETFL, O_NONBLOCK); + memset(&lr, 0, sizeof(lr)); + lr.nl_family = AF_NETLINK; + lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; + if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + fds_route.fd = sock; + fds_route.events = POLL_IN; + + sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock_arp < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + fcntl(sock_arp, F_SETFL, O_NONBLOCK); + 
memset(&la, 0, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; + if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + fds_arp.fd = sock_arp; + fds_arp.events = POLL_IN; + + memset(prev, 0, sizeof(prev)); + do { + signal(SIGINT, close_and_exit); + signal(SIGTERM, close_and_exit); + + sleep(interval); + for (key = 0; key < nr_keys; key++) { + __u64 sum = 0; + + assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[key][i]); + if (sum) + printf("proto %u: %10llu pkt/s\n", + key, sum / interval); + memcpy(prev[key], values, sizeof(values)); + } + + memset(buf, 0, sizeof(buf)); + if (poll(&fds_route, 1, 3) == POLL_IN) { + nll = recv_msg(lr, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + printf("Routing table updated.\n"); + read_route(nh, nll); + } + memset(buf, 0, sizeof(buf)); + if (poll(&fds_arp, 1, 3) == POLL_IN) { + nll = recv_msg(la, sock_arp); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); + } + + } while (1); +cleanup: + close(sock); + return ret; +} + +int main(int ac, char **argv) +{ + char filename[256]; + char **ifname_list; + int i = 1; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (ac < 2) { + printf("usage: %s [-S] Interface name list\n", argv[0]); + return 1; + } + if (!strcmp(argv[1], "-S")) { + flags = XDP_FLAGS_SKB_MODE; + total_ifindex = ac - 2; + ifname_list = (argv + 2); + } else { + flags = 0; + total_ifindex = ac - 1; + ifname_list = (argv + 1); + } + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("\n**************loading bpf file*********************\n\n\n"); + if (!prog_fd[0]) { + printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + ifindex_list = (int *)malloc(total_ifindex * sizeof(int *)); + for (i = 0; i < total_ifindex; i++) { + ifindex_list[i] = if_nametoindex(ifname_list[i]); + if (!ifindex_list[i]) { + printf("Couldn't translate interface name: %s", + strerror(errno)); + return 1; + } + } + for (i = 0; i < total_ifindex; i++) { + if (set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) { + printf("link set xdp fd failed\n"); + int recovery_index = i; + + for (i = 0; i < recovery_index; i++) + set_link_xdp_fd(ifindex_list[i], -1, flags); + + return 1; + } + printf("Attached to %d\n", ifindex_list[i]); + } + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + printf("*******************ROUTE TABLE*************************\n\n\n"); + get_route_table(AF_INET); + printf("*******************ARP TABLE***************************\n\n\n"); + get_arp_table(AF_INET); + if (monitor_route() < 0) { + printf("Error in receiving route update"); + return 1; + } + + return 0; +} -- cgit v1.2.3 From aaf151b9e68101b03ba42d581e8a424bdd0110fe Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Sun, 5 Nov 2017 18:44:10 -0800 Subject: bpf: Rename tcp_bbf.readme to tcp_bpf.readme The original patch had the wrong filename. Fixes: bfdf75693875 ("bpf: create samples/bpf/tcp_bpf.readme") Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/tcp_bbf.readme | 26 -------------------------- samples/bpf/tcp_bpf.readme | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) delete mode 100644 samples/bpf/tcp_bbf.readme create mode 100644 samples/bpf/tcp_bpf.readme (limited to 'samples') diff --git a/samples/bpf/tcp_bbf.readme b/samples/bpf/tcp_bbf.readme deleted file mode 100644 index 831fb601e3c9..000000000000 --- a/samples/bpf/tcp_bbf.readme +++ /dev/null @@ -1,26 +0,0 @@ -This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) -programs. These programs attach to a cgroupv2. The following commands create -a cgroupv2 and attach a bash shell to the group. - - mkdir -p /tmp/cgroupv2 - mount -t cgroup2 none /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2/foo - bash - echo $$ >> /tmp/cgroupv2/foo/cgroup.procs - -Anything that runs under this shell belongs to the foo cgroupv2 To load -(attach) one of the tcp_*_kern.o programs: - - ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o - -If the "-l" flag is used, the load_sock_ops program will continue to run -printing the BPF log buffer. The tcp_*_kern.o programs use special print -functions to print logging information (if enabled by the ifdef). - -If using netperf/netserver to create traffic, you need to run them under the -cgroupv2 to which the BPF programs are attached (i.e. under bash shell -attached to the cgroupv2). - -To remove (unattach) a socket_ops BPF program from a cgroupv2: - - ./load_sock_ops -r /tmp/cgroupv2/foo diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme new file mode 100644 index 000000000000..831fb601e3c9 --- /dev/null +++ b/samples/bpf/tcp_bpf.readme @@ -0,0 +1,26 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. + + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2 To load +(attach) one of the tcp_*_kern.o programs: + + ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + +If the "-l" flag is used, the load_sock_ops program will continue to run +printing the BPF log buffer. The tcp_*_kern.o programs use special print +functions to print logging information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). + +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + ./load_sock_ops -r /tmp/cgroupv2/foo -- cgit v1.2.3 From eafb3401faf243f7dca0e23325242cb8c2269ee9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:43 -0500 Subject: samples/bpf: add a test for bpf_override_return This adds a basic test for bpf_override_return to verify it works. We override the main function for mounting a btrfs fs so it'll return -ENOMEM and then make sure that trying to mount a btrfs fs will fail. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- samples/bpf/Makefile | 4 ++++ samples/bpf/test_override_return.sh | 15 +++++++++++++++ samples/bpf/tracex7_kern.c | 16 ++++++++++++++++ samples/bpf/tracex7_user.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+) create mode 100755 samples/bpf/test_override_return.sh create mode 100644 samples/bpf/tracex7_kern.c create mode 100644 samples/bpf/tracex7_user.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3b4945c1eab0..87db0f9a4c15 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -15,6 +15,7 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 +hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -61,6 +62,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o +tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -104,6 +106,7 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o +always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -158,6 +161,7 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf +HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh new file mode 100755 index 000000000000..e68b9ee6814b --- /dev/null +++ b/samples/bpf/test_override_return.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir tmpmnt +./tracex7 $DEVICE +if [ $? -eq 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!" +fi +losetup -d $DEVICE diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c new file mode 100644 index 000000000000..1ab308a43e0f --- /dev/null +++ b/samples/bpf/tracex7_kern.c @@ -0,0 +1,16 @@ +#include +#include +#include +#include "bpf_helpers.h" + +SEC("kprobe/open_ctree") +int bpf_prog1(struct pt_regs *ctx) +{ + unsigned long rc = -12; + + bpf_override_return(ctx, rc); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c new file mode 100644 index 000000000000..8a52ac492e8b --- /dev/null +++ b/samples/bpf/tracex7_user.c @@ -0,0 +1,28 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +int main(int argc, char **argv) +{ + FILE *f; + char filename[256]; + char command[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + snprintf(command, 256, "mount %s tmpmnt/", argv[1]); + f = popen(command, "r"); + ret = pclose(f); + + return ret ? 
0 : 1; +} -- cgit v1.2.3 From 7863f46bac3a1716f7d547c53f367ddf509f031e Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 10 Nov 2017 22:19:50 -0800 Subject: bpf: Fix tcp_synrto_kern.c sample program The program was returning -1 in some cases which is not allowed by the verifier any longer. Fixes: 390ee7e29fc8 ("bpf: enforce return code for cgroup-bpf programs") Signed-off-by: Lawrence Brakmo Signed-off-by: David S. Miller --- samples/bpf/tcp_synrto_kern.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index 3c3fc83d81cb..232bb242823e 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -38,8 +38,10 @@ int bpf_synrto(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; -- cgit v1.2.3 From 016e661bb0610a98b1c9ac1250e3269236fabe19 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 10 Nov 2017 22:19:51 -0800 Subject: bpf: Fix tcp_rwnd_kern.c sample program The program was returning -1 in some cases which is not allowed by the verifier any longer. Fixes: 390ee7e29fc8 ("bpf: enforce return code for cgroup-bpf programs") Signed-off-by: Lawrence Brakmo Signed-off-by: David S. Miller --- samples/bpf/tcp_rwnd_kern.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index 3f2a228f81ce..09ff65b40b31 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -38,8 +38,10 @@ int bpf_rwnd(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != - 55601 && skops->local_port != 55601) - return -1; + 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; -- cgit v1.2.3 From a4174f0560f849317239478b1b22afbf03a6eda2 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 10 Nov 2017 22:19:52 -0800 Subject: bpf: Fix tcp_bufs_kern.c sample program The program was returning -1 in some cases which is not allowed by the verifier any longer. Fixes: 390ee7e29fc8 ("bpf: enforce return code for cgroup-bpf programs") Signed-off-by: Lawrence Brakmo Signed-off-by: David S. 
Miller --- samples/bpf/tcp_bufs_kern.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index ee83bbabd17c..0566b7fa38a1 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -41,8 +41,10 @@ int bpf_bufs(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; @@ -61,8 +63,8 @@ int bpf_bufs(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: /* Nothing to do */ @@ -71,8 +73,8 @@ int bpf_bufs(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of passive connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; default: rv = -1; -- cgit v1.2.3 From 2ff969fbe2bfa4486b66226917352d4bb12ec1cb Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 10 Nov 2017 22:19:53 -0800 Subject: bpf: Fix tcp_cong_kern.c sample program The program was returning -1 in some cases which is not allowed by the verifier any longer. Fixes: 390ee7e29fc8 ("bpf: enforce return code for cgroup-bpf programs") Signed-off-by: Lawrence Brakmo Signed-off-by: David S. Miller --- samples/bpf/tcp_cong_kern.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index dac15bce1fa9..ad0f1ba8206a 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -39,8 +39,10 @@ int bpf_cong(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; -- cgit v1.2.3 From e1853319fc4c7279a3561b1844af4a02487c93ef Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 10 Nov 2017 22:19:54 -0800 Subject: bpf: Fix tcp_iw_kern.c sample program The program was returning -1 in some cases which is not allowed by the verifier any longer. Fixes: 390ee7e29fc8 ("bpf: enforce return code for cgroup-bpf programs") Signed-off-by: Lawrence Brakmo Signed-off-by: David S. 
Miller --- samples/bpf/tcp_iw_kern.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index 23c5122ef819..4ca5ecc9f580 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -42,8 +42,10 @@ int bpf_iw(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; @@ -62,8 +64,8 @@ int bpf_iw(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, @@ -73,8 +75,8 @@ int bpf_iw(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of passive connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; default: rv = -1; -- cgit v1.2.3 From 03e982eed419da37e9cac1d759097dbe10447190 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 10 Nov 2017 22:19:55 -0800 Subject: bpf: Fix tcp_clamp_kern.c sample program The program was returning -1 in some cases which is not allowed by the verifier any longer. Fixes: 390ee7e29fc8 ("bpf: enforce return code for cgroup-bpf programs") Signed-off-by: Lawrence Brakmo Signed-off-by: David S. 
Miller --- samples/bpf/tcp_clamp_kern.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index d68eadd9ca2d..f4225c9d2c0c 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -41,8 +41,10 @@ int bpf_clamp(struct bpf_sock_ops *skops) /* For testing purposes, only execute rest of BPF program * if neither port numberis 55601 */ - if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) - return -1; + if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 0; + } op = (int) skops->op; @@ -66,9 +68,9 @@ int bpf_clamp(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_RCVBUF, &bufsize, - sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: rv = bpf_setsockopt(skops, SOL_TCP, @@ -80,12 +82,12 @@ int bpf_clamp(struct bpf_sock_ops *skops) rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SNDCWND_CLAMP, &clamp, sizeof(clamp)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_SNDBUF, &bufsize, - sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_RCVBUF, &bufsize, - sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); break; default: rv = -1; -- cgit v1.2.3 From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. 
Miller --- samples/bpf/Makefile | 4 ---- samples/bpf/test_override_return.sh | 15 --------------- samples/bpf/tracex7_kern.c | 16 ---------------- samples/bpf/tracex7_user.c | 28 ---------------------------- 4 files changed, 63 deletions(-) delete mode 100755 samples/bpf/test_override_return.sh delete mode 100644 samples/bpf/tracex7_kern.c delete mode 100644 samples/bpf/tracex7_user.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 87db0f9a4c15..3b4945c1eab0 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -15,7 +15,6 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 -hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -62,7 +61,6 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o -tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -106,7 +104,6 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o -always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -161,7 +158,6 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf -HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh deleted file mode 100755 index e68b9ee6814b..000000000000 --- a/samples/bpf/test_override_return.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -rm -f testfile.img -dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 -DEVICE=$(losetup --show -f testfile.img) -mkfs.btrfs -f $DEVICE -mkdir tmpmnt -./tracex7 $DEVICE -if [ $? -eq 0 ] -then - echo "SUCCESS!" -else - echo "FAILED!" -fi -losetup -d $DEVICE diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c deleted file mode 100644 index 1ab308a43e0f..000000000000 --- a/samples/bpf/tracex7_kern.c +++ /dev/null @@ -1,16 +0,0 @@ -#include -#include -#include -#include "bpf_helpers.h" - -SEC("kprobe/open_ctree") -int bpf_prog1(struct pt_regs *ctx) -{ - unsigned long rc = -12; - - bpf_override_return(ctx, rc); - return 0; -} - -char _license[] SEC("license") = "GPL"; -u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c deleted file mode 100644 index 8a52ac492e8b..000000000000 --- a/samples/bpf/tracex7_user.c +++ /dev/null @@ -1,28 +0,0 @@ -#define _GNU_SOURCE - -#include -#include -#include -#include "libbpf.h" -#include "bpf_load.h" - -int main(int argc, char **argv) -{ - FILE *f; - char filename[256]; - char command[256]; - int ret; - - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); - return 1; - } - - snprintf(command, 256, "mount %s tmpmnt/", argv[1]); - f = popen(command, "r"); - ret = pclose(f); - - return ret ? 
0 : 1; -} -- cgit v1.2.3 From fae45363ae4bac980b1d7451233c7bf3d66d9300 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 14 Nov 2017 09:12:03 +0300 Subject: xdp: sample: Missing curly braces in read_route() The assert statement is supposed to be part of the else branch but the curly braces were accidentally left off. Fixes: 3e29cd0e6563 ("xdp: Sample xdp program implementing ip forward") Signed-off-by: Dan Carpenter Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- samples/bpf/xdp_router_ipv4_user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 2c1fe3f4b1a4..916462112d55 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -206,12 +206,13 @@ static void read_route(struct nlmsghdr *nh, int nll) direct_entry.arp.mac = 0; direct_entry.arp.dst = 0; if (route.dst_len == 32) { - if (nh->nlmsg_type == RTM_DELROUTE) + if (nh->nlmsg_type == RTM_DELROUTE) { assert(bpf_map_delete_elem(map_fd[3], &route.dst) == 0); - else + } else { if (bpf_map_lookup_elem(map_fd[2], &route.dst, &direct_entry.arp.mac) == 0) direct_entry.arp.dst = route.dst; assert(bpf_map_update_elem(map_fd[3], &route.dst, &direct_entry, 0) == 0); + } } for (i = 0; i < 4; i++) prefix_key->data[i] = (route.dst >> i * 8) & 0xff; -- cgit v1.2.3