diff options
Diffstat (limited to 'samples')
-rw-r--r-- | samples/bpf/Makefile | 5 | ||||
-rw-r--r-- | samples/bpf/bpf_load.c | 22 | ||||
-rw-r--r-- | samples/bpf/cookie_uid_helper_example.c | 2 | ||||
-rw-r--r-- | samples/bpf/cpustat_kern.c | 281 | ||||
-rw-r--r-- | samples/bpf/cpustat_user.c | 219 | ||||
-rw-r--r-- | samples/bpf/tcbpf2_kern.c | 6 | ||||
-rwxr-xr-x | samples/bpf/test_cgrp2_sock.sh | 1 | ||||
-rwxr-xr-x | samples/bpf/test_cgrp2_sock2.sh | 3 | ||||
-rw-r--r-- | samples/bpf/test_overhead_raw_tp_kern.c | 17 | ||||
-rw-r--r-- | samples/bpf/test_overhead_user.c | 12 | ||||
-rwxr-xr-x | samples/bpf/test_tunnel_bpf.sh | 5 | ||||
-rw-r--r-- | samples/bpf/trace_event_kern.c | 4 | ||||
-rw-r--r-- | samples/bpf/trace_event_user.c | 15 | ||||
-rw-r--r-- | samples/bpf/xdp_redirect_user.c | 7 | ||||
-rw-r--r-- | samples/sockmap/Makefile | 2 | ||||
-rw-r--r-- | samples/sockmap/sockmap_kern.c | 239 | ||||
-rwxr-xr-x | samples/sockmap/sockmap_test.sh | 488 | ||||
-rw-r--r-- | samples/sockmap/sockmap_user.c | 360 |
18 files changed, 1653 insertions, 35 deletions
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index ec3fc8d88e87..4d6a6edd4bf6 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp +hostprogs-y += cpustat # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o @@ -93,6 +94,7 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o +cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -117,6 +119,7 @@ always += offwaketime_kern.o always += spintest_kern.o always += map_perf_test_kern.o always += test_overhead_tp_kern.o +always += test_overhead_raw_tp_kern.o always += test_overhead_kprobe_kern.o always += parse_varlen.o parse_simple.o parse_ldabs.o always += test_cgrp2_tc_kern.o @@ -144,6 +147,7 @@ always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o always += xdp2skb_meta_kern.o always += syscall_tp_kern.o +always += cpustat_kern.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -188,6 +192,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf +HOSTLOADLIBES_cpustat += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 69806d74fa53..bebe4188b4b3 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -61,12 +61,14 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; + bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; bool is_xdp = strncmp(event, "xdp", 3) == 0; bool is_perf_event = strncmp(event, "perf_event", 10) == 0; bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; bool is_sockops = strncmp(event, "sockops", 7) == 0; bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0; + bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -84,6 +86,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_KPROBE; } else if (is_tracepoint) { prog_type = BPF_PROG_TYPE_TRACEPOINT; + } else if (is_raw_tracepoint) { + prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; } else if (is_xdp) { prog_type = BPF_PROG_TYPE_XDP; } else if (is_perf_event) { @@ -96,6 +100,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_SOCK_OPS; } else if (is_sk_skb) { prog_type = BPF_PROG_TYPE_SK_SKB; + } else if (is_sk_msg) { + prog_type = BPF_PROG_TYPE_SK_MSG; } else { printf("Unknown event '%s'\n", event); return -1; @@ -113,7 +119,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket || is_sockops || is_sk_skb) { + if (is_socket || is_sockops || is_sk_skb || is_sk_msg) { if (is_socket) event += 6; else @@ -128,6 +134,16 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return populate_prog_array(event, fd); } + if (is_raw_tracepoint) { + efd = bpf_raw_tracepoint_open(event + 15, fd); + if (efd < 0) { + printf("tracepoint %s %s\n", event + 15, strerror(errno)); + return -1; + } + event_fd[prog_cnt - 1] = efd; + return 0; + } + if (is_kprobe || is_kretprobe) { if (is_kprobe) event += 7; @@ -584,12 +600,14 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "tracepoint/", 11) == 0 || + memcmp(shname, "raw_tracepoint/", 15) == 0 || memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || memcmp(shname, "cgroup/", 7) == 0 || memcmp(shname, "sockops", 7) == 0 || - memcmp(shname, "sk_skb", 6) == 0) { + memcmp(shname, "sk_skb", 6) == 0 || + memcmp(shname, "sk_msg", 6) == 0) { ret = load_and_attach(shname, data->d_buf, data->d_size); if (ret != 0) diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index 9d751e209f31..8eca27e595ae 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -246,7 +246,7 @@ static void udp_client(void) recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0, (struct sockaddr *)&si_me, &slen); if (recv_len < 0) - error(1, errno, "revieve\n"); + error(1, errno, "receive\n"); res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr), sizeof(si_me.sin_addr)); if (res != 0) diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c new file mode 100644 index 000000000000..68c84da065b1 --- /dev/null +++ b/samples/bpf/cpustat_kern.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/version.h> +#include <linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* + * The CPU number, cstate number and pstate number are based + * on 96boards Hikey with octa CA53 CPUs. + * + * Every CPU have three idle states for cstate: + * WFI, CPU_OFF, CLUSTER_OFF + * + * Every CPU have 5 operating points: + * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz + * + * This code is based on these assumption and other platforms + * need to adjust these definitions. + */ +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 + +static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 }; + +/* + * my_map structure is used to record cstate and pstate index and + * timestamp (Idx, Ts), when new event incoming we need to update + * combination for new state index and timestamp (Idx`, Ts`). + * + * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time + * interval for the previous state: Duration(Idx) = Ts` - Ts. + * + * Every CPU has one below array for recording state index and + * timestamp, and record for cstate and pstate saperately: + * + * +--------------------------+ + * | cstate timestamp | + * +--------------------------+ + * | cstate index | + * +--------------------------+ + * | pstate timestamp | + * +--------------------------+ + * | pstate index | + * +--------------------------+ + */ +#define MAP_OFF_CSTATE_TIME 0 +#define MAP_OFF_CSTATE_IDX 1 +#define MAP_OFF_PSTATE_TIME 2 +#define MAP_OFF_PSTATE_IDX 3 +#define MAP_OFF_NUM 4 + +struct bpf_map_def SEC("maps") my_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAP_OFF_NUM, +}; + +/* cstate_duration records duration time for every idle state per CPU */ +struct bpf_map_def SEC("maps") cstate_duration = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES, +}; + +/* pstate_duration records duration time for every operating point per CPU */ +struct bpf_map_def SEC("maps") pstate_duration = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES, +}; + +/* + * The trace events for cpu_idle and cpu_frequency are taken from: + * /sys/kernel/debug/tracing/events/power/cpu_idle/format + * /sys/kernel/debug/tracing/events/power/cpu_frequency/format + * + * These two events have same format, so define one common structure. + */ +struct cpu_args { + u64 pad; + u32 state; + u32 cpu_id; +}; + +/* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ +static u32 find_cpu_pstate_idx(u32 frequency) +{ + u32 i; + + for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { + if (frequency == cpu_opps[i]) + return i; + } + + return i; +} + +SEC("tracepoint/power/cpu_idle") +int bpf_prog1(struct cpu_args *ctx) +{ + u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + if (ctx->cpu_id > MAX_CPU) + return 0; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; + cts = bpf_map_lookup_elem(&my_map, &key); + if (!cts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + prev_state = *cstate; + *cstate = ctx->state; + + if (!*cts) { + *cts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *cts; + *cts = cur_ts; + + /* + * When state doesn't equal to (u32)-1, the cpu will enter + * one idle state; for this case we need to record interval + * for the pstate. + * + * OPP2 + * +---------------------+ + * OPP1 | | + * ---------+ | + * | Idle state + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + if (ctx->state != (u32)-1) { + + /* record pstate after have first cpu_frequency event */ + if (!*pts) + return 0; + + delta = cur_ts - *pts; + + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + /* + * When state equal to (u32)-1, the cpu just exits from one + * specific idle state; for this case we need to record + * interval for the pstate. + * + * OPP2 + * -----------+ + * | OPP1 + * | +----------- + * | Idle state | + * +---------------------+ + * + * |<- cstate duration ->| + * ^ ^ + * cts cur_ts + */ + } else { + + key = cpu * MAX_CSTATE_ENTRIES + prev_state; + val = bpf_map_lookup_elem(&cstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + } + + /* Update timestamp for pstate as new start time */ + if (*pts) + *pts = cur_ts; + + return 0; +} + +SEC("tracepoint/power/cpu_frequency") +int bpf_prog2(struct cpu_args *ctx) +{ + u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u32 key, cpu, pstate_idx; + u64 *val; + + cpu = ctx->cpu_id; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; + pts = bpf_map_lookup_elem(&my_map, &key); + if (!pts) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; + pstate = bpf_map_lookup_elem(&my_map, &key); + if (!pstate) + return 0; + + key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; + cstate = bpf_map_lookup_elem(&my_map, &key); + if (!cstate) + return 0; + + prev_state = *pstate; + *pstate = ctx->state; + + if (!*pts) { + *pts = bpf_ktime_get_ns(); + return 0; + } + + cur_ts = bpf_ktime_get_ns(); + delta = cur_ts - *pts; + *pts = cur_ts; + + /* When CPU is in idle, bail out to skip pstate statistics */ + if (*cstate != (u32)(-1)) + return 0; + + /* + * The cpu changes to another different OPP (in below diagram + * change frequency from OPP3 to OPP1), need recording interval + * for previous frequency OPP3 and update timestamp as start + * time for new frequency OPP1. + * + * OPP3 + * +---------------------+ + * OPP2 | | + * ---------+ | + * | OPP1 + * +--------------- + * + * |<- pstate duration ->| + * ^ ^ + * pts cur_ts + */ + pstate_idx = find_cpu_pstate_idx(*pstate); + if (pstate_idx >= MAX_PSTATE_ENTRIES) + return 0; + + key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; + val = bpf_map_lookup_elem(&pstate_duration, &key); + if (val) + __sync_fetch_and_add((long *)val, delta); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/cpustat_user.c b/samples/bpf/cpustat_user.c new file mode 100644 index 000000000000..2b4cd1ae57c5 --- /dev/null +++ b/samples/bpf/cpustat_user.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sched.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <linux/bpf.h> +#include <locale.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/wait.h> + +#include "libbpf.h" +#include "bpf_load.h" + +#define MAX_CPU 8 +#define MAX_PSTATE_ENTRIES 5 +#define MAX_CSTATE_ENTRIES 3 +#define MAX_STARS 40 + +#define CPUFREQ_MAX_SYSFS_PATH "/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq" +#define CPUFREQ_LOWEST_FREQ "208000" +#define CPUFREQ_HIGHEST_FREQ "12000000" + +struct cpu_stat_data { + unsigned long cstate[MAX_CSTATE_ENTRIES]; + unsigned long pstate[MAX_PSTATE_ENTRIES]; +}; + +static struct cpu_stat_data stat_data[MAX_CPU]; + +static void cpu_stat_print(void) +{ + int i, j; + char state_str[sizeof("cstate-9")]; + struct cpu_stat_data *data; + + /* Clear screen */ + printf("\033[2J"); + + /* Header */ + printf("\nCPU states statistics:\n"); + printf("%-10s ", "state(ms)"); + + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + sprintf(state_str, "cstate-%d", i); + printf("%-11s ", state_str); + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + sprintf(state_str, "pstate-%d", i); + printf("%-11s ", state_str); + } + + printf("\n"); + + for (j = 0; j < MAX_CPU; j++) { + data = &stat_data[j]; + + printf("CPU-%-6d ", j); + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) + printf("%-11ld ", data->cstate[i] / 1000000); + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) + printf("%-11ld ", data->pstate[i] / 1000000); + + printf("\n"); + } +} + +static void cpu_stat_update(int cstate_fd, int pstate_fd) +{ + unsigned long key, value; + int c, i; + + for (c = 0; c < MAX_CPU; c++) { + for (i = 0; i < MAX_CSTATE_ENTRIES; i++) { + key = c * MAX_CSTATE_ENTRIES + i; + bpf_map_lookup_elem(cstate_fd, &key, &value); + stat_data[c].cstate[i] = value; + } + + for (i = 0; i < MAX_PSTATE_ENTRIES; i++) { + key = c * MAX_PSTATE_ENTRIES + i; + bpf_map_lookup_elem(pstate_fd, &key, &value); + stat_data[c].pstate[i] = value; + } + } +} + +/* + * This function is copied from 'idlestat' tool function + * idlestat_wake_all() in idlestate.c. + * + * It sets the self running task affinity to cpus one by one so can wake up + * the specific CPU to handle scheduling; this results in all cpus can be + * waken up once and produce ftrace event 'trace_cpu_idle'. + */ +static int cpu_stat_inject_cpu_idle_event(void) +{ + int rcpu, i, ret; + cpu_set_t cpumask; + cpu_set_t original_cpumask; + + ret = sysconf(_SC_NPROCESSORS_CONF); + if (ret < 0) + return -1; + + rcpu = sched_getcpu(); + if (rcpu < 0) + return -1; + + /* Keep track of the CPUs we will run on */ + sched_getaffinity(0, sizeof(original_cpumask), &original_cpumask); + + for (i = 0; i < ret; i++) { + + /* Pointless to wake up ourself */ + if (i == rcpu) + continue; + + /* Pointless to wake CPUs we will not run on */ + if (!CPU_ISSET(i, &original_cpumask)) + continue; + + CPU_ZERO(&cpumask); + CPU_SET(i, &cpumask); + + sched_setaffinity(0, sizeof(cpumask), &cpumask); + } + + /* Enable all the CPUs of the original mask */ + sched_setaffinity(0, sizeof(original_cpumask), &original_cpumask); + return 0; +} + +/* + * It's possible to have no any frequency change for long time and cannot + * get ftrace event 'trace_cpu_frequency' for long period, this introduces + * big deviation for pstate statistics. + * + * To solve this issue, below code forces to set 'scaling_max_freq' to 208MHz + * for triggering ftrace event 'trace_cpu_frequency' and then recovery back to + * the maximum frequency value 1.2GHz. + */ +static int cpu_stat_inject_cpu_frequency_event(void) +{ + int len, fd; + + fd = open(CPUFREQ_MAX_SYSFS_PATH, O_WRONLY); + if (fd < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + return fd; + } + + len = write(fd, CPUFREQ_LOWEST_FREQ, strlen(CPUFREQ_LOWEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + + len = write(fd, CPUFREQ_HIGHEST_FREQ, strlen(CPUFREQ_HIGHEST_FREQ)); + if (len < 0) { + printf("failed to open scaling_max_freq, errno=%d\n", errno); + goto err; + } + +err: + close(fd); + return len; +} + +static void int_exit(int sig) +{ + cpu_stat_inject_cpu_idle_event(); + cpu_stat_inject_cpu_frequency_event(); + cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_print(); + exit(0); +} + +int main(int argc, char **argv) +{ + char filename[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + ret = cpu_stat_inject_cpu_idle_event(); + if (ret < 0) + return 1; + + ret = cpu_stat_inject_cpu_frequency_event(); + if (ret < 0) + return 1; + + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + while (1) { + cpu_stat_update(map_fd[1], map_fd[2]); + cpu_stat_print(); + sleep(5); + } + + return 0; +} diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index efdc16d195ff..9a8db7bd6db4 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -52,7 +52,8 @@ int _gre_set_tunnel(struct __sk_buff *skb) key.tunnel_tos = 0; key.tunnel_ttl = 64; - ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX); + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER); if (ret < 0) { ERROR(ret); return TC_ACT_SHOT; @@ -92,7 +93,8 @@ int _ip6gretap_set_tunnel(struct __sk_buff *skb) key.tunnel_label = 0xabcde; ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), - BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX); + BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_SEQ_NUMBER); if (ret < 0) { ERROR(ret); return TC_ACT_SHOT; diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh index 8ee0371a100a..9f6174236856 100755 --- a/samples/bpf/test_cgrp2_sock.sh +++ b/samples/bpf/test_cgrp2_sock.sh @@ -61,6 +61,7 @@ cleanup_and_exit() [ -n "$msg" ] && echo "ERROR: $msg" + test_cgrp2_sock -d ${CGRP_MNT}/sockopts ip li del cgrp2_sock umount ${CGRP_MNT} diff --git a/samples/bpf/test_cgrp2_sock2.sh b/samples/bpf/test_cgrp2_sock2.sh index fc4e64d00cb3..0f396a86e0cb 100755 --- a/samples/bpf/test_cgrp2_sock2.sh +++ b/samples/bpf/test_cgrp2_sock2.sh @@ -28,6 +28,9 @@ function attach_bpf { } function cleanup { + if [ -d /tmp/cgroupv2/foo ]; then + test_cgrp2_sock -d /tmp/cgroupv2/foo + fi ip link del veth0b ip netns delete at_ns0 umount /tmp/cgroupv2 diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c new file mode 100644 index 000000000000..d2af8bc1c805 --- /dev/null +++ b/samples/bpf/test_overhead_raw_tp_kern.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +SEC("raw_tracepoint/task_rename") +int prog(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} + +SEC("raw_tracepoint/urandom_read") +int prog2(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c index d291167fd3c7..e1d35e07a10e 100644 --- a/samples/bpf/test_overhead_user.c +++ b/samples/bpf/test_overhead_user.c @@ -158,5 +158,17 @@ int main(int argc, char **argv) unload_progs(); } + if (test_flags & 0xC0) { + snprintf(filename, sizeof(filename), + "%s_raw_tp_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/RAW_TRACEPOINT\n"); + run_perf_test(num_cpu, test_flags >> 6); + unload_progs(); + } + return 0; } diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 43ce049996ee..c265863ccdf9 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -23,7 +23,8 @@ function config_device { function add_gre_tunnel { # in namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE key 2 local 172.16.1.100 remote 172.16.1.200 + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 @@ -43,7 +44,7 @@ function add_ip6gretap_tunnel { # in namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \ + ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \ local ::11 remote ::22 ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index a77a583d94d4..7068fbdde951 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -39,6 +39,7 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) { char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; char time_fmt2[] = "Get Time Failed, ErrCode: %d"; + char addr_fmt[] = "Address recorded on event: %llx"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); struct bpf_perf_event_value value_buf; @@ -64,6 +65,9 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) else bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + if (ctx->addr != 0) + bpf_trace_printk(addr_fmt, sizeof(addr_fmt), ctx->addr); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index bf4f1b6d9a52..56f7a259a7c9 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -215,6 +215,17 @@ static void test_bpf_perf_event(void) /* Intel Instruction Retired */ .config = 0xc0, }; + struct perf_event_attr attr_type_raw_lock_load = { + .sample_freq = SAMPLE_FREQ, + .freq = 1, + .type = PERF_TYPE_RAW, + /* Intel MEM_UOPS_RETIRED.LOCK_LOADS */ + .config = 0x21d0, + /* Request to record lock address from PEBS */ + .sample_type = PERF_SAMPLE_ADDR, + /* Record address value requires precise event */ + .precise_ip = 2, + }; printf("Test HW_CPU_CYCLES\n"); test_perf_event_all_cpu(&attr_type_hw); @@ -236,6 +247,10 @@ static void test_bpf_perf_event(void) test_perf_event_all_cpu(&attr_type_raw); test_perf_event_task(&attr_type_raw); + printf("Test Lock Load\n"); + test_perf_event_all_cpu(&attr_type_raw_lock_load); + test_perf_event_task(&attr_type_raw_lock_load); + printf("*** PASS ***\n"); } diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index d54e91eb6cbf..b701b5c21342 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -20,6 +20,7 @@ #include <string.h> #include <unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -75,6 +76,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int ret, opt, key = 0; @@ -98,6 +100,11 @@ int main(int argc, char **argv) return 1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex_in = strtoul(argv[optind], NULL, 0); ifindex_out = strtoul(argv[optind + 1], NULL, 0); printf("input: %d output: %d\n", ifindex_in, ifindex_out); diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile index 73f1da4d116c..9bf2881bd11b 100644 --- a/samples/sockmap/Makefile +++ b/samples/sockmap/Makefile @@ -2,7 +2,7 @@ hostprogs-y := sockmap # Libbpf dependencies -LIBBPF := ../../tools/lib/bpf/bpf.o +LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 52b0053274f4..9ff8bc5dc206 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -43,6 +43,55 @@ struct bpf_map_def SEC("maps") sock_map = { .max_entries = 20, }; +struct bpf_map_def SEC("maps") sock_map_txmsg = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +struct bpf_map_def SEC("maps") sock_map_redir = { + .type = BPF_MAP_TYPE_SOCKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 20, +}; + +struct bpf_map_def SEC("maps") sock_apply_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_cork_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_pull_bytes = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 2 +}; + +struct bpf_map_def SEC("maps") sock_redir_flags = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_skb_opts = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) { @@ -54,15 +103,24 @@ int bpf_prog2(struct __sk_buff *skb) { __u32 lport = skb->local_port; __u32 rport = skb->remote_port; - int ret = 0; + int len, *f, ret, zero = 0; + __u64 flags = 0; if (lport == 10000) ret = 10; else ret = 1; - bpf_printk("sockmap: %d -> %d @ %d\n", lport, bpf_ntohl(rport), ret); - return bpf_sk_redirect_map(skb, &sock_map, ret, 0); + len = (__u32)skb->data_end - (__u32)skb->data; + f = bpf_map_lookup_elem(&sock_skb_opts, &zero); + if (f && *f) { + ret = 3; + flags = *f; + } + + bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", + len, flags); + return bpf_sk_redirect_map(skb, &sock_map, ret, flags); } SEC("sockops") @@ -105,4 +163,179 @@ int bpf_sockmap(struct bpf_sock_ops *skops) return 0; } + +SEC("sk_msg1") +int bpf_prog4(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1; + int *start, *end; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + return SK_PASS; +} + +SEC("sk_msg2") +int bpf_prog5(struct sk_msg_md *msg) +{ + int err1 = -1, err2 = -1, zero = 0, one = 1; + int *bytes, *start, *end, len1, len2; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) { + int err; + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? *end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } + bpf_printk("sk_msg2: data length %i err1 %i err2 %i\n", + len1, err1, err2); + return SK_PASS; +} + +SEC("sk_msg3") +int bpf_prog6(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1, key = 0; + int *start, *end, *f; + __u64 flags = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } + return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); +} + +SEC("sk_msg4") +int bpf_prog7(struct sk_msg_md *msg) +{ + int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0; + int *f, *bytes, *start, *end, len1, len2; + __u64 flags = 0; + + int err; + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + err1 = bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + err2 = bpf_msg_cork_bytes(msg, *bytes); + len1 = (__u64)msg->data_end - (__u64)msg->data; + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) { + + bpf_printk("sk_msg2: pull(%i:%i)\n", + start ? *start : 0, end ? *end : 0); + err = bpf_msg_pull_data(msg, *start, *end, 0); + if (err) + bpf_printk("sk_msg2: pull_data err %i\n", + err); + len2 = (__u64)msg->data_end - (__u64)msg->data; + bpf_printk("sk_msg2: length update %i->%i\n", + len1, len2); + } + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } + bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", + len1, flags, err1 ? err1 : err2); + err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); + bpf_printk("sk_msg3: err %i\n", err); + return err; +} + +SEC("sk_msg5") +int bpf_prog8(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) { + ret = bpf_msg_apply_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } else { + return SK_DROP; + } + return SK_PASS; +} +SEC("sk_msg6") +int bpf_prog9(struct sk_msg_md *msg) +{ + void *data_end = (void *)(long) msg->data_end; + void *data = (void *)(long) msg->data; + int ret = 0, *bytes, zero = 0; + + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) { + if (((__u64)data_end - (__u64)data) >= *bytes) + return SK_PASS; + ret = bpf_msg_cork_bytes(msg, *bytes); + if (ret) + return SK_DROP; + } + return SK_PASS; +} + +SEC("sk_msg7") +int bpf_prog10(struct sk_msg_md *msg) +{ + int *bytes, zero = 0, one = 1; + int *start, *end; + + bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); + if (bytes) + bpf_msg_apply_bytes(msg, *bytes); + bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero); + if (bytes) + bpf_msg_cork_bytes(msg, *bytes); + start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); + end = bpf_map_lookup_elem(&sock_pull_bytes, &one); + if (start && end) + bpf_msg_pull_data(msg, *start, *end, 0); + + return SK_DROP; +} + + char _license[] SEC("license") = "GPL"; diff --git a/samples/sockmap/sockmap_test.sh b/samples/sockmap/sockmap_test.sh new file mode 100755 index 000000000000..ace75f070eb8 --- /dev/null +++ b/samples/sockmap/sockmap_test.sh @@ -0,0 +1,488 @@ +#Test a bunch of positive cases to verify basic functionality +for prog in "--txmsg_redir --txmsg_skb" "--txmsg_redir --txmsg_ingress" "--txmsg" "--txmsg_redir" "--txmsg_redir --txmsg_ingress" "--txmsg_drop"; do +for t in "sendmsg" "sendpage"; do +for r in 1 10 100; do + for i in 1 10 100; do + for l in 1 10 100; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 + done + done +done +done +done + +#Test max iov +t="sendmsg" +r=1 +i=1024 +l=1 +prog="--txmsg" + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST +sleep 2 +prog="--txmsg_redir" +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST + +# Test max iov with 1k send + +t="sendmsg" +r=1 +i=1024 +l=1024 +prog="--txmsg" + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST +sleep 2 +prog="--txmsg_redir" +TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" +echo $TEST +$TEST +sleep 2 + +# Test apply with 1B +r=1 +i=1024 +l=1024 +prog="--txmsg_apply 1" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply with apply that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply and redirect with 1B +r=1 +i=1024 +l=1024 +prog="--txmsg_redir --txmsg_apply 1" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +prog="--txmsg_redir --txmsg_apply 1 --txmsg_ingress" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +prog="--txmsg_redir --txmsg_apply 1 --txmsg_skb" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# Test apply and redirect with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_redir --txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +prog="--txmsg_redir --txmsg_apply 2048 --txmsg_ingress" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +prog="--txmsg_redir --txmsg_apply 2048 --txmsg_skb" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# Test apply and redirect with apply that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_apply 2048" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with 1B not really useful but test it anyways +r=1 +i=1024 +l=1024 +prog="--txmsg_cork 1" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with a more reasonable 100B +r=1 +i=1000 +l=1000 +prog="--txmsg_cork 100" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with cork that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +r=1 +i=1024 +l=1024 +prog="--txmsg_redir --txmsg_cork 1" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with a more reasonable 100B +r=1 +i=1000 +l=1000 +prog="--txmsg_redir --txmsg_cork 100" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with larger value than send +r=1 +i=8 +l=1024 +prog="--txmsg_redir --txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test cork with cork that never reaches limit +r=1024 +i=1 +l=1 +prog="--txmsg_cork 2048" + +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# mix and match cork and apply not really useful but valid programs + +# Test apply < cork +r=100 +i=1 +l=5 +prog="--txmsg_apply 10 --txmsg_cork 100" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Try again with larger sizes so we hit overflow case +r=100 +i=1000 +l=2048 +prog="--txmsg_apply 4096 --txmsg_cork 8096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply > cork +r=100 +i=1 +l=5 +prog="--txmsg_apply 100 --txmsg_cork 10" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Again with larger sizes so we hit overflow cases +r=100 +i=1000 +l=2048 +prog="--txmsg_apply 8096 --txmsg_cork 4096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# Test apply = cork +r=100 +i=1 +l=5 +prog="--txmsg_apply 10 --txmsg_cork 10" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +r=100 +i=1000 +l=2048 +prog="--txmsg_apply 4096 --txmsg_cork 4096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply < cork +r=100 +i=1 +l=5 +prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 100" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Try again with larger sizes so we hit overflow case +r=100 +i=1000 +l=2048 +prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 8096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Test apply > cork +r=100 +i=1 +l=5 +prog="--txmsg_redir --txmsg_apply 100 --txmsg_cork 10" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Again with larger sizes so we hit overflow cases +r=100 +i=1000 +l=2048 +prog="--txmsg_redir --txmsg_apply 8096 --txmsg_cork 4096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + +# Test apply = cork +r=100 +i=1 +l=5 +prog="--txmsg_redir --txmsg_apply 10 --txmsg_cork 10" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +r=100 +i=1000 +l=2048 +prog="--txmsg_redir --txmsg_apply 4096 --txmsg_cork 4096" +for t in "sendpage" "sendmsg"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +# Tests for bpf_msg_pull_data() +for i in `seq 99 100 1600`; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 0 --txmsg_end $i --txmsg_cork 1600" + echo $TEST + $TEST + sleep 2 +done + +for i in `seq 199 100 1600`; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 100 --txmsg_end $i --txmsg_cork 1600" + echo $TEST + $TEST + sleep 2 +done + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 1500 --txmsg_end 1600 --txmsg_cork 1600" +echo $TEST +$TEST +sleep 2 + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 1111 --txmsg_end 1112 --txmsg_cork 1600" +echo $TEST +$TEST +sleep 2 + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 1111 --txmsg_end 0 --txmsg_cork 1600" +echo $TEST +$TEST +sleep 2 + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1600" +echo $TEST +$TEST +sleep 2 + +TEST="./sockmap --cgroup /mnt/cgroup2/ -t sendpage -r 16 -i 1 -l 100 \ + --txmsg --txmsg_start 0 --txmsg_end 1601 --txmsg_cork 1602" +echo $TEST +$TEST +sleep 2 + +# Run through gamut again with start and end +for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do +for t in "sendmsg" "sendpage"; do +for r in 1 10 100; do + for i in 1 10 100; do + for l in 1 10 100; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog --txmsg_start 1 --txmsg_end 2" + echo $TEST + $TEST + sleep 2 + done + done +done +done +done + +# Some specific tests to cover specific code paths +./sockmap --cgroup /mnt/cgroup2/ -t sendpage \ + -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3 +./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \ + -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 3 +./sockmap --cgroup /mnt/cgroup2/ -t sendpage \ + -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5 +./sockmap --cgroup /mnt/cgroup2/ -t sendmsg \ + -r 5 -i 1 -l 1 --txmsg_redir --txmsg_cork 5 --txmsg_apply 5 diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7c25c0c112bc..6f2334912283 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -29,6 +29,7 @@ #include <sys/time.h> #include <sys/resource.h> #include <sys/types.h> +#include <sys/sendfile.h> #include <linux/netlink.h> #include <linux/socket.h> @@ -54,6 +55,18 @@ void running_handler(int a); /* global sockets */ int s1, s2, c1, c2, p1, p2; +int txmsg_pass; +int txmsg_noisy; +int txmsg_redir; +int txmsg_redir_noisy; +int txmsg_drop; +int txmsg_apply; +int txmsg_cork; +int txmsg_start; +int txmsg_end; +int txmsg_ingress; +int txmsg_skb; + static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, {"cgroup", required_argument, NULL, 'c' }, @@ -62,6 +75,18 @@ static const struct option long_options[] = { {"iov_count", required_argument, NULL, 'i' }, {"length", required_argument, NULL, 'l' }, {"test", required_argument, NULL, 't' }, + {"data_test", no_argument, NULL, 'd' }, + {"txmsg", no_argument, &txmsg_pass, 1 }, + {"txmsg_noisy", no_argument, &txmsg_noisy, 1 }, + {"txmsg_redir", no_argument, &txmsg_redir, 1 }, + {"txmsg_redir_noisy", no_argument, &txmsg_redir_noisy, 1}, + {"txmsg_drop", no_argument, &txmsg_drop, 1 }, + {"txmsg_apply", required_argument, NULL, 'a'}, + {"txmsg_cork", required_argument, NULL, 'k'}, + {"txmsg_start", required_argument, NULL, 's'}, + {"txmsg_end", required_argument, NULL, 'e'}, + {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, + {"txmsg_skb", no_argument, &txmsg_skb, 1 }, {0, 0, NULL, 0 } }; @@ -195,19 +220,71 @@ struct msg_stats { struct timespec end; }; +struct sockmap_options { + int verbose; + bool base; + bool sendpage; + bool data_test; + bool drop_expected; +}; + +static int msg_loop_sendpage(int fd, int iov_length, int cnt, + struct msg_stats *s, + struct sockmap_options *opt) +{ + bool drop = opt->drop_expected; + unsigned char k = 0; + FILE *file; + int i, fp; + + file = fopen(".sendpage_tst.tmp", "w+"); + for (i = 0; i < iov_length * cnt; i++, k++) + fwrite(&k, sizeof(char), 1, file); + fflush(file); + fseek(file, 0, SEEK_SET); + fclose(file); + + fp = open(".sendpage_tst.tmp", O_RDONLY); + clock_gettime(CLOCK_MONOTONIC, &s->start); + for (i = 0; i < cnt; i++) { + int sent = sendfile(fd, fp, NULL, iov_length); + + if (!drop && sent < 0) { + perror("send loop error:"); + close(fp); + return sent; + } else if (drop && sent >= 0) { + printf("sendpage loop error expected: %i\n", sent); + close(fp); + return -EIO; + } + + if (sent > 0) + s->bytes_sent += sent; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + close(fp); + return 0; +} + static int msg_loop(int fd, int iov_count, int iov_length, int cnt, - struct msg_stats *s, bool tx) + struct msg_stats *s, bool tx, + struct sockmap_options *opt) { struct msghdr msg = {0}; int err, i, flags = MSG_NOSIGNAL; struct iovec *iov; + unsigned char k; + bool data_test = opt->data_test; + bool drop = opt->drop_expected; iov = calloc(iov_count, sizeof(struct iovec)); if (!iov) return errno; + k = 0; for (i = 0; i < iov_count; i++) { - char *d = calloc(iov_length, sizeof(char)); + unsigned char *d = calloc(iov_length, sizeof(char)); if (!d) { fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); @@ -215,21 +292,34 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } iov[i].iov_base = d; iov[i].iov_len = iov_length; + + if (data_test && tx) { + int j; + + for (j = 0; j < iov_length; j++) + d[j] = k++; + } } msg.msg_iov = iov; msg.msg_iovlen = iov_count; + k = 0; if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { int sent = sendmsg(fd, &msg, flags); - if (sent < 0) { + if (!drop && sent < 0) { perror("send loop error:"); goto out_errno; + } else if (drop && sent >= 0) { + printf("send loop error expected: %i\n", sent); + errno = -EIO; + goto out_errno; } - s->bytes_sent += sent; + if (sent > 0) + s->bytes_sent += sent; } clock_gettime(CLOCK_MONOTONIC, &s->end); } else { @@ -272,6 +362,26 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } s->bytes_recvd += recv; + + if (data_test) { + int j; + + for (i = 0; i < msg.msg_iovlen; i++) { + unsigned char *d = iov[i].iov_base; + + for (j = 0; + j < iov[i].iov_len && recv; j++) { + if (d[j] != k++) { + errno = -EIO; + fprintf(stderr, + "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", + i, j, d[j], k - 1, d[j+1], k + 1); + goto out_errno; + } + recv--; + } + } + } } clock_gettime(CLOCK_MONOTONIC, &s->end); } @@ -300,7 +410,7 @@ static inline float recvdBps(struct msg_stats s) } static int sendmsg_test(int iov_count, int iov_buf, int cnt, - int verbose, bool base) + struct sockmap_options *opt) { float sent_Bps = 0, recvd_Bps = 0; int rx_fd, txpid, rxpid, err = 0; @@ -309,14 +419,20 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, errno = 0; - if (base) + if (opt->base) rx_fd = p1; else rx_fd = p2; rxpid = fork(); if (rxpid == 0) { - err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); + if (opt->drop_expected) + exit(1); + + if (opt->sendpage) + iov_count = 1; + err = msg_loop(rx_fd, iov_count, iov_buf, + cnt, &s, false, opt); if (err) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", @@ -339,7 +455,12 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, txpid = fork(); if (txpid == 0) { - err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (opt->sendpage) + err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt); + else + err = msg_loop(c1, iov_count, iov_buf, + cnt, &s, true, opt); + if (err) fprintf(stderr, "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", @@ -364,7 +485,7 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, return err; } -static int forever_ping_pong(int rate, int verbose) +static int forever_ping_pong(int rate, struct sockmap_options *opt) { struct timeval timeout; char buf[1024] = {0}; @@ -429,7 +550,7 @@ static int forever_ping_pong(int rate, int verbose) if (rate) sleep(rate); - if (verbose) { + if (opt->verbose) { printf("."); fflush(stdout); @@ -443,20 +564,34 @@ enum { PING_PONG, SENDMSG, BASE, + BASE_SENDPAGE, + SENDPAGE, }; int main(int argc, char **argv) { - int iov_count = 1, length = 1024, rate = 1, verbose = 0; + int iov_count = 1, length = 1024, rate = 1, tx_prog_fd; struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; int opt, longindex, err, cg_fd = 0; + struct sockmap_options options = {0}; int test = PING_PONG; char filename[256]; - while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", + while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:", long_options, &longindex)) != -1) { switch (opt) { - /* Cgroup configuration */ + case 's': + txmsg_start = atoi(optarg); + break; + case 'e': + txmsg_end = atoi(optarg); + break; + case 'a': + txmsg_apply = atoi(optarg); + break; + case 'k': + txmsg_cork = atoi(optarg); + break; case 'c': cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); if (cg_fd < 0) { @@ -470,7 +605,7 @@ int main(int argc, char **argv) rate = atoi(optarg); break; case 'v': - verbose = 1; + options.verbose = 1; break; case 'i': iov_count = atoi(optarg); @@ -478,6 +613,9 @@ int main(int argc, char **argv) case 'l': length = atoi(optarg); break; + case 'd': + options.data_test = true; + break; case 't': if (strcmp(optarg, "ping") == 0) { test = PING_PONG; @@ -485,11 +623,17 @@ int main(int argc, char **argv) test = SENDMSG; } else if (strcmp(optarg, "base") == 0) { test = BASE; + } else if (strcmp(optarg, "base_sendpage") == 0) { + test = BASE_SENDPAGE; + } else if (strcmp(optarg, "sendpage") == 0) { + test = SENDPAGE; } else { usage(argv); return -1; } break; + case 0: + break; case 'h': default: usage(argv); @@ -515,16 +659,16 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); - /* If base test skip BPF setup */ - if (test == BASE) - goto run; - if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); return 1; } + /* If base test skip BPF setup */ + if (test == BASE || test == BASE_SENDPAGE) + goto run; + /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -557,15 +701,183 @@ run: goto out; } - if (test == PING_PONG) - err = forever_ping_pong(rate, verbose); - else if (test == SENDMSG) - err = sendmsg_test(iov_count, length, rate, verbose, false); - else if (test == BASE) - err = sendmsg_test(iov_count, length, rate, verbose, true); + /* Attach txmsg program to sockmap */ + if (txmsg_pass) + tx_prog_fd = prog_fd[3]; + else if (txmsg_noisy) + tx_prog_fd = prog_fd[4]; + else if (txmsg_redir) + tx_prog_fd = prog_fd[5]; + else if (txmsg_redir_noisy) + tx_prog_fd = prog_fd[6]; + else if (txmsg_drop) + tx_prog_fd = prog_fd[9]; + /* apply and cork must be last */ + else if (txmsg_apply) + tx_prog_fd = prog_fd[7]; + else if (txmsg_cork) + tx_prog_fd = prog_fd[8]; else + tx_prog_fd = 0; + + if (tx_prog_fd) { + int redir_fd, i = 0; + + err = bpf_prog_attach(tx_prog_fd, + map_fd[1], BPF_SK_MSG_VERDICT, 0); + if (err) { + fprintf(stderr, + "ERROR: bpf_prog_attach (txmsg): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg): %d (%s\n", + err, strerror(errno)); + return err; + } + + if (txmsg_redir || txmsg_redir_noisy) + redir_fd = c2; + else + redir_fd = c1; + + err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg): %d (%s\n", + err, strerror(errno)); + return err; + } + + if (txmsg_apply) { + err = bpf_map_update_elem(map_fd[3], + &i, &txmsg_apply, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (apply_bytes): %d (%s\n", + err, strerror(errno)); + return err; + } + } + + if (txmsg_cork) { + err = bpf_map_update_elem(map_fd[4], + &i, &txmsg_cork, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (cork_bytes): %d (%s\n", + err, strerror(errno)); + return err; + } + } + + if (txmsg_start) { + err = bpf_map_update_elem(map_fd[5], + &i, &txmsg_start, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_start): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + + if (txmsg_end) { + i = 1; + err = bpf_map_update_elem(map_fd[5], + &i, &txmsg_end, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_end): %d (%s)\n", + err, strerror(errno)); + return err; + } + } + + if (txmsg_ingress) { + int in = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + i = 1; + err = bpf_map_update_elem(map_fd[1], &i, &p1, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (p1 txmsg): %d (%s)\n", + err, strerror(errno)); + } + err = bpf_map_update_elem(map_fd[2], &i, &p1, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (p1 redir): %d (%s)\n", + err, strerror(errno)); + } + + i = 2; + err = bpf_map_update_elem(map_fd[2], &i, &p2, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (p2 txmsg): %d (%s)\n", + err, strerror(errno)); + } + } + + if (txmsg_skb) { + int skb_fd = (test == SENDMSG || test == SENDPAGE) ? p2 : p1; + int ingress = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[7], &i, &ingress, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + + i = 3; + err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", + err, strerror(errno)); + } + } + } + + if (txmsg_drop) + options.drop_expected = true; + + if (test == PING_PONG) + err = forever_ping_pong(rate, &options); + else if (test == SENDMSG) { + options.base = false; + options.sendpage = false; + err = sendmsg_test(iov_count, length, rate, &options); + } else if (test == SENDPAGE) { + options.base = false; + options.sendpage = true; + err = sendmsg_test(iov_count, length, rate, &options); + } else if (test == BASE) { + options.base = true; + options.sendpage = false; + err = sendmsg_test(iov_count, length, rate, &options); + } else if (test == BASE_SENDPAGE) { + options.base = true; + options.sendpage = true; + err = sendmsg_test(iov_count, length, rate, &options); + } else fprintf(stderr, "unknown test\n"); out: + bpf_prog_detach2(prog_fd[2], cg_fd, BPF_CGROUP_SOCK_OPS); close(s1); close(s2); close(p1); |