From e086101b150ae8e99e54ab26101ef3835fa9f48d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 13 Oct 2017 13:03:16 -0700 Subject: tcp: add a tracepoint for tcp retransmission We need a real-time notification for tcp retransmission for monitoring. Of course we could use ftrace to dynamically instrument this kernel function too, however we can't retrieve the connection information at the same time, for example perf-tools [1] reads /proc/net/tcp for socket details, which is slow when we have a lot of connections. Therefore, this patch adds a tracepoint for __tcp_retransmit_skb() and exposes src/dst IP addresses and ports of the connection. This also makes it easier to integrate into perf. Note, I expose both IPv4 and IPv6 addresses at the same time: for an IPv4 socket, the v4-mapped address is used as the IPv6 address; for an IPv6 socket, LOOPBACK4_IPV6 is already filled by kernel. Also, add sk and skb pointers as they are useful for BPF. 1. https://github.com/brendangregg/perf-tools/blob/master/net/tcpretrans Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Hannes Frederic Sowa Cc: Brendan Gregg Cc: Neal Cardwell Signed-off-by: Cong Wang Acked-by: Alexei Starovoitov Acked-by: Brendan Gregg Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 include/trace/events/tcp.h (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h new file mode 100644 index 000000000000..3d1cbd072b7e --- /dev/null +++ b/include/trace/events/tcp.h @@ -0,0 +1,68 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM tcp + +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TCP_H + +#include +#include +#include +#include + +TRACE_EVENT(tcp_retransmit_skb, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb), + + TP_STRUCT__entry( + __field(void *, skbaddr) + __field(void *, skaddr) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skbaddr = skb; + __entry->skaddr = sk; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + + if (np) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = np->saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = *(np->daddr_cache); + } else { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6 daddrv6=%pI6", + __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + +#endif /* _TRACE_TCP_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 
9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:40:02 -0400 Subject: tracing: bpf: Hide bpf trace events when they are not used All the trace events defined in include/trace/events/bpf.h are only used when CONFIG_BPF_SYSCALL is defined. But this file gets included by include/linux/bpf_trace.h which is included by the networking code with CREATE_TRACE_POINTS defined. If a trace event is created but not used it still has data structures and functions created for its use, even though nothing is using them. To not waste space, do not define the BPF trace events in bpf.h unless CONFIG_BPF_SYSCALL is defined. Signed-off-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/trace/events/bpf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h index 52c8425d144b..1fb58faa4a44 100644 --- a/include/trace/events/bpf.h +++ b/include/trace/events/bpf.h @@ -4,6 +4,9 @@ #if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BPF_H +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL + #include #include #include @@ -345,7 +348,7 @@ TRACE_EVENT(bpf_map_next_key, __print_hex(__get_dynamic_array(nxt), __entry->key_len), __entry->key_trunc ? " ..." : "") ); - +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _TRACE_BPF_H */ #include -- cgit v1.2.3 From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. 
This served as a good benchmark for measuring the overhead of remote refcnt decrement. If the driver page recycle cache is not efficient, then this exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind this being so efficient at transferring memory pointers between CPUs, without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as the implementation requires a separate upstream discussion. V5: - Fix a maybe-uninitialized pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a simpler void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/trace/events/xdp.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 4e16c43fba10..eb2ece96c1a2 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, __entry->map_id, __entry->map_index) ); +#define devmap_ifindex(fwd, map) \ + (!fwd ? 0 : \ + (!map ? 0 : \ + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct net_device *)fwd)->ifindex : 0))) + #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ - trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \ + trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ - trace_xdp_redirect_map_err(dev, xdp, fwd ? 
fwd->ifindex : 0, \ + trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) #endif /* _TRACE_XDP_H */ -- cgit v1.2.3 From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:44 +0200 Subject: bpf: cpumap add tracepoints This adds two tracepoints to the cpumap. One for the enqueue side trace_xdp_cpumap_enqueue() and one for the kthread dequeue side trace_xdp_cpumap_kthread(). To mitigate the tracepoint overhead, these are invoked during the enqueue/dequeue bulking phases, thus amortizing the cost. The obvious use-cases are for debugging and monitoring. The non-intuitive use-case is using these as a feedback loop to know the system load. One can imagine auto-scaling by reducing, adding or activating more worker CPUs on demand. V4: tracepoint remove time_limit info, instead add sched info V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/trace/events/xdp.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index eb2ece96c1a2..0c8dec61987e 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -150,6 +150,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ err, map, idx) +TRACE_EVENT(xdp_cpumap_kthread, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int sched), + + TP_ARGS(map_id, processed, drops, sched), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, sched) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->sched = sched; + ), + + TP_printk("kthread" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " sched=%d", + __entry->cpu, __entry->map_id, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->sched) +); + +TRACE_EVENT(xdp_cpumap_enqueue, + + TP_PROTO(int map_id, unsigned int processed, unsigned int drops, + int to_cpu), + + TP_ARGS(map_id, processed, drops, to_cpu), + + TP_STRUCT__entry( + __field(int, map_id) + __field(u32, act) + __field(int, cpu) + __field(unsigned int, drops) + __field(unsigned int, processed) + __field(int, to_cpu) + ), + + TP_fast_assign( + __entry->map_id = map_id; + __entry->act = XDP_REDIRECT; + __entry->cpu = smp_processor_id(); + __entry->drops = drops; + __entry->processed = processed; + __entry->to_cpu = to_cpu; + ), + + TP_printk("enqueue" + " cpu=%d map_id=%d action=%s" + " processed=%u drops=%u" + " to_cpu=%d", + __entry->cpu, __entry->map_id, + 
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->processed, __entry->drops, + __entry->to_cpu) +); + #endif /* _TRACE_XDP_H */ #include -- cgit v1.2.3 From fb6ff75e18937a20dbec1eb47b5f893f38eabae4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 16 Oct 2017 14:24:02 -0700 Subject: tcp: Use pI6c in tcp tracepoint The compact form for IPv6 addresses is more user friendly than the full version. For example: compact: 2001:db8:1::1 full: 2001:0db8:0001:0000:0000:0000:0000:0001 Update the tcp tracepoint to show the compact form. Signed-off-by: David Ahern Acked-by: Cong Wang Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3d1cbd072b7e..1ffab6d96e94 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -57,7 +57,7 @@ TRACE_EVENT(tcp_retransmit_skb, } ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6 daddrv6=%pI6", + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); -- cgit v1.2.3 From 386fd5da401dc6c4b0ab6a54d333609876b699fe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 16 Oct 2017 15:32:07 -0700 Subject: tcp: Check daddr_cache before use in tracepoint Running perf in one window to capture tcp_retransmit_skb tracepoint: $ perf record -e tcp:tcp_retransmit_skb -a And causing a retransmission on an active TCP session (e.g., dropping packets in the receiver, changing MTU on the interface to 500 and back to 1500) triggers a panic: [ 58.543144] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 58.545300] IP: perf_trace_tcp_retransmit_skb+0xd0/0x145 [ 58.546770] PGD 0 P4D 0 [ 58.547472] Oops: 0000 [#1] SMP [ 58.548328] Modules linked in: vrf [ 
58.549262] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc4+ #26 [ 58.551004] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 58.554560] task: ffffffff81a0e540 task.stack: ffffffff81a00000 [ 58.555817] RIP: 0010:perf_trace_tcp_retransmit_skb+0xd0/0x145 [ 58.557137] RSP: 0018:ffff88003fc03d68 EFLAGS: 00010282 [ 58.558292] RAX: 0000000000000000 RBX: ffffe8ffffc0ec80 RCX: ffff880038543098 [ 58.559850] RDX: 0400000000000000 RSI: ffff88003fc03d70 RDI: ffff88003fc14b68 [ 58.561099] RBP: ffff88003fc03da8 R08: 0000000000000000 R09: ffffea0000d3224a [ 58.562005] R10: ffff88003fc03db8 R11: 0000000000000010 R12: ffff8800385428c0 [ 58.562930] R13: ffffe8ffffc0e478 R14: ffffffff81a93a40 R15: ffff88003d4f0c00 [ 58.563845] FS: 0000000000000000(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 [ 58.564873] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 58.565613] CR2: 0000000000000008 CR3: 000000003d68f004 CR4: 00000000000606f0 [ 58.566538] Call Trace: [ 58.566865] [ 58.567140] __tcp_retransmit_skb+0x4ab/0x4c6 [ 58.567704] ? tcp_set_ca_state+0x22/0x3f [ 58.568231] tcp_retransmit_skb+0x14/0xa3 [ 58.568754] tcp_retransmit_timer+0x472/0x5e3 [ 58.569324] ? tcp_write_timer_handler+0x1e9/0x1e9 [ 58.569946] tcp_write_timer_handler+0x95/0x1e9 [ 58.570548] tcp_write_timer+0x2a/0x58 Check that daddr_cache is non-NULL before de-referencing. Fixes: e086101b150a ("tcp: add a tracepoint for tcp retransmission") Signed-off-by: David Ahern Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1ffab6d96e94..f51c130f1e0f 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -27,7 +27,6 @@ TRACE_EVENT(tcp_retransmit_skb, ), TP_fast_assign( - struct ipv6_pinfo *np = inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct in6_addr *pin6; __be32 *p32; @@ -44,11 +43,12 @@ TRACE_EVENT(tcp_retransmit_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; - if (np) { + /* IPv6 socket ? */ + if (inet6_sk(sk)) { pin6 = (struct in6_addr *)__entry->saddr_v6; - *pin6 = np->saddr; + *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; - *pin6 = *(np->daddr_cache); + *pin6 = sk->sk_v6_daddr; } else { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); -- cgit v1.2.3 From 890056783c60ad9d0789774af2bc10fe4f27dd9d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 08:17:29 -0700 Subject: tcp: Remove use of inet6_sk and add IPv6 checks to tracepoint 386fd5da401d ("tcp: Check daddr_cache before use in tracepoint") was the second version of the tracepoint fixup patch. This patch is the delta between v2 and v3. Specifically, remove the use of inet6_sk and check sk_family as requested by Eric and add IS_ENABLED(CONFIG_IPV6) around the use of sk_v6_rcv_saddr and sk_v6_daddr as done in sock_common (noted by Cong). Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Tested-by: Song Liu Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index f51c130f1e0f..c3220d914475 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -43,13 +43,15 @@ TRACE_EVENT(tcp_retransmit_skb, p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; - /* IPv6 socket ? */ - if (inet6_sk(sk)) { +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { pin6 = (struct in6_addr *)__entry->saddr_v6; *pin6 = sk->sk_v6_rcv_saddr; pin6 = (struct in6_addr *)__entry->daddr_v6; *pin6 = sk->sk_v6_daddr; - } else { + } else +#endif + { pin6 = (struct in6_addr *)__entry->saddr_v6; ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); pin6 = (struct in6_addr *)__entry->daddr_v6; -- cgit v1.2.3 From b65f164d37cf6d4aac59b0e13c2e5c4cfe293fd2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 19 Oct 2017 09:31:43 +0200 Subject: ipv6: let trace_fib6_table_lookup() dereference the fib table The perf traces for ipv6 routing code show a relevant cost around trace_fib6_table_lookup(), even if no trace is enabled. This is due to the fib6_table de-referencing currently performed by the caller. Let the tracing code pay this overhead by passing the table pointer to the trace helper. This gives a small but measurable performance improvement under UDP flood. Signed-off-by: Paolo Abeni Acked-by: Steven Rostedt (VMware) Acked-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/trace/events/fib6.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index d60096cddb2a..b34bed17abc7 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,9 +12,9 @@ TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct rt6_info *rt, - u32 tb_id, const struct flowi6 *flp), + struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, tb_id, flp), + TP_ARGS(net, rt, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -34,7 +34,7 @@ TRACE_EVENT(fib6_table_lookup, TP_fast_assign( struct in6_addr *in6; - __entry->tb_id = tb_id; + __entry->tb_id = table->tb6_id; __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); -- cgit v1.2.3 From f6e37b25413cf636369668652e9752ee77c7d9f7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:22 -0700 Subject: tcp: add trace event class tcp_event_sk_skb Introduce event class tcp_event_sk_skb for tcp tracepoints that have arguments sk and skb. Existing tracepoint trace_tcp_retransmit_skb() falls into this class. This patch rewrites the definition of trace_tcp_retransmit_skb() with tcp_event_sk_skb. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c3220d914475..14b0a7083f1d 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,7 +9,13 @@ #include #include -TRACE_EVENT(tcp_retransmit_skb, +/* + * tcp event with arguments sk and skb + * + * Note: this class requires a valid sk pointer; while skb pointer could + * be NULL. 
+ */ +DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(struct sock *sk, struct sk_buff *skb), @@ -64,6 +70,13 @@ TRACE_EVENT(tcp_retransmit_skb, __entry->saddr_v6, __entry->daddr_v6) ); +DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, + + TP_PROTO(struct sock *sk, struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 7344e29f285a94b965075599731811c352f3ab40 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:23 -0700 Subject: tcp: mark trace event arguments sk and skb as const Some functions that we plan to add trace points require const sk and/or skb. So we mark these fields as const in the tracepoint. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 14b0a7083f1d..2b6fe72c6781 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -17,13 +17,13 @@ */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( - __field(void *, skbaddr) - __field(void *, skaddr) + __field(const void *, skbaddr) + __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __array(__u8, saddr, 4) @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb, DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); -- cgit v1.2.3 From c24b14c46bb88d844275de5c4024c8745ae89d42 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:24 -0700 Subject: tcp: add tracepoint trace_tcp_send_reset New tracepoint trace_tcp_send_reset is added and called from tcp_v4_send_reset(), tcp_v6_send_reset() and 
tcp_send_active_reset(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 2b6fe72c6781..3e57e1ae1c6b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -77,6 +77,17 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_ARGS(sk, skb) ); +/* + * skb of trace_tcp_send_reset is the skb that caused RST. In case of + * active reset, skb should be NULL + */ +DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, + + TP_PROTO(const struct sock *sk, const struct sk_buff *skb), + + TP_ARGS(sk, skb) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 5941521c05d69cf3f2b1293eefd21207e083b70f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:25 -0700 Subject: tcp: add tracepoint trace_tcp_receive_reset New tracepoint trace_tcp_receive_reset is added and called from tcp_reset(). This tracepoint is defined with a new class tcp_event_sk. Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 3e57e1ae1c6b..c83c71187719 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -88,6 +88,72 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_ARGS(sk, skb) ); +/* + * tcp event with arguments sk + * + * Note: this class requires a valid sk pointer. 
+ */ +DECLARE_EVENT_CLASS(tcp_event_sk, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + +DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From e1a4aa50f47303ebb3ca0cfd01687884551ce03d Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:26 -0700 Subject: tcp: add tracepoint trace_tcp_destroy_sock This patch adds trace event trace_tcp_destroy_sock. Signed-off-by: Song Liu Signed-off-by: David S. 
Miller --- include/trace/events/tcp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index c83c71187719..1724c12c25cf 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -154,6 +154,13 @@ DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_ARGS(sk) ); +DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, + + TP_PROTO(const struct sock *sk), + + TP_ARGS(sk) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From e8fce23946b7e7eadf25ad78d8207c22903dfe27 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 23 Oct 2017 09:20:27 -0700 Subject: tcp: add tracepoint trace_tcp_set_state() This patch adds tracepoint trace_tcp_set_state. Besides usual fields (s/d ports, IP addresses), old and new state of the socket is also printed with TP_printk, with __print_symbolic(). Signed-off-by: Song Liu Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1724c12c25cf..03699ba71623 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -9,6 +9,22 @@ #include #include +#define tcp_state_name(state) { state, #state } +#define show_tcp_state_name(val) \ + __print_symbolic(val, \ + tcp_state_name(TCP_ESTABLISHED), \ + tcp_state_name(TCP_SYN_SENT), \ + tcp_state_name(TCP_SYN_RECV), \ + tcp_state_name(TCP_FIN_WAIT1), \ + tcp_state_name(TCP_FIN_WAIT2), \ + tcp_state_name(TCP_TIME_WAIT), \ + tcp_state_name(TCP_CLOSE), \ + tcp_state_name(TCP_CLOSE_WAIT), \ + tcp_state_name(TCP_LAST_ACK), \ + tcp_state_name(TCP_LISTEN), \ + tcp_state_name(TCP_CLOSING), \ + tcp_state_name(TCP_NEW_SYN_RECV)) + /* * tcp event with arguments sk and skb * @@ -161,6 +177,66 @@ DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_ARGS(sk) ); 
+TRACE_EVENT(tcp_set_state, + + TP_PROTO(const struct sock *sk, const int oldstate, const int newstate), + + TP_ARGS(sk, oldstate, newstate), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(int, oldstate) + __field(int, newstate) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->oldstate = oldstate; + __entry->newstate = newstate; + + __entry->sport = ntohs(inet->inet_sport); + __entry->dport = ntohs(inet->inet_dport); + + p32 = (__be32 *) __entry->saddr; + *p32 = inet->inet_saddr; + + p32 = (__be32 *) __entry->daddr; + *p32 = inet->inet_daddr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + *pin6 = sk->sk_v6_rcv_saddr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = sk->sk_v6_daddr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(inet->inet_saddr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(inet->inet_daddr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6, + show_tcp_state_name(__entry->oldstate), + show_tcp_state_name(__entry->newstate)) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a kprobe/uprobe/tracepoint single trace event. 
Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative would be to introduce a mutex lock in every trace_event_call structure, but that takes a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program will replace the to-be-detached program in place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/trace/perf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/perf.h b/include/trace/perf.h index 04fe68bbe767..14f127b6acf5 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto) \ struct trace_event_call *event_call = __data; \ struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\ struct trace_event_raw_##call *entry; \ - struct bpf_prog *prog = event_call->prog; \ struct pt_regs *__regs; \ u64 __count = 1; \ struct task_struct *__task = NULL; \ @@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto) \ __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \ \ head = this_cpu_ptr(event_call->perf_events); \ - if (!prog && __builtin_constant_p(!__task) && !__task && \ - hlist_empty(head)) \ + if (!bpf_prog_array_valid(event_call) && \ + __builtin_constant_p(!__task) && !__task && \ + hlist_empty(head)) \ return; \ \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ -- cgit v1.2.3 From cf34ce3da1e41579296364509266c7dac573822a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 30 Oct 2017 14:41:35 -0700 Subject: tcp: add tracepoint trace_tcp_retransmit_synack() This tracepoint can be used to trace synack 
retransmits. It maintains pointer to struct request_sock. We cannot simply reuse trace_tcp_retransmit_skb() here, because the sk here is the LISTEN socket. The IP addresses and ports should be extracted from struct request_sock. Note that, like many other tracepoints, this patch uses IS_ENABLED in TP_fast_assign macro, which triggers sparse warning like: ./include/trace/events/tcp.h:274:1: error: directive in argument list ./include/trace/events/tcp.h:281:1: error: directive in argument list However, there is no good solution to avoid these warnings. To the best of our knowledge, these warnings are harmless. Signed-off-by: Song Liu Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/trace/events/tcp.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 03699ba71623..07cccca6cbf1 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -237,6 +237,62 @@ TRACE_EVENT(tcp_set_state, show_tcp_state_name(__entry->newstate)) ); +TRACE_EVENT(tcp_retransmit_synack, + + TP_PROTO(const struct sock *sk, const struct request_sock *req), + + TP_ARGS(sk, req), + + TP_STRUCT__entry( + __field(const void *, skaddr) + __field(const void *, req) + __field(__u16, sport) + __field(__u16, dport) + __array(__u8, saddr, 4) + __array(__u8, daddr, 4) + __array(__u8, saddr_v6, 16) + __array(__u8, daddr_v6, 16) + ), + + TP_fast_assign( + struct inet_request_sock *ireq = inet_rsk(req); + struct in6_addr *pin6; + __be32 *p32; + + __entry->skaddr = sk; + __entry->req = req; + + __entry->sport = ireq->ir_num; + __entry->dport = ntohs(ireq->ir_rmt_port); + + p32 = (__be32 *) __entry->saddr; + *p32 = ireq->ir_loc_addr; + + p32 = (__be32 *) __entry->daddr; + *p32 = ireq->ir_rmt_addr; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + pin6 = (struct in6_addr *)__entry->saddr_v6; + 
*pin6 = ireq->ir_v6_loc_addr; + pin6 = (struct in6_addr *)__entry->daddr_v6; + *pin6 = ireq->ir_v6_rmt_addr; + } else +#endif + { + pin6 = (struct in6_addr *)__entry->saddr_v6; + ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6); + pin6 = (struct in6_addr *)__entry->daddr_v6; + ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6); + } + ), + + TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", + __entry->sport, __entry->dport, + __entry->saddr, __entry->daddr, + __entry->saddr_v6, __entry->daddr_v6) +); + #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ -- cgit v1.2.3 From a3dcaf17ee54f1d01d22cc2b22cab0b4f60d78cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:27 -0800 Subject: net: allow per netns sysctl_rmem and sysctl_wmem for protos As we want to gradually implement per netns sysctl_rmem and sysctl_wmem on a per-protocol basis, add two new fields in struct proto, and two new helpers: sk_get_wmem0() and sk_get_rmem0(). The first user will be TCP. Then UDP and SCTP can be easily converted, while DECNET probably won't get this support. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/trace/events/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 6d31c0520ef3..ec4dade24466 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -48,7 +48,7 @@ TRACE_EVENT(sock_exceed_buf_limit, strncpy(__entry->name, prot->name, 32); __entry->sysctl_mem = prot->sysctl_mem; __entry->allocated = allocated; - __entry->sysctl_rmem = prot->sysctl_rmem[0]; + __entry->sysctl_rmem = sk_get_rmem0(sk, prot); __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); ), -- cgit v1.2.3