From 8919a9b31eb4fb4c0a93e5fb350a626924302aa6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:32 -0400 Subject: tcp: Only init congestion control if not initialized already Change tcp_init_transfer() to only initialize congestion control if it has not been initialized already. With this new approach, we can arrange things so that if the EBPF code sets the congestion control by calling setsockopt(TCP_CONGESTION) then tcp_init_transfer() will not re-initialize the CC module. This is an approach that has the following beneficial properties: (1) This allows CC module customizations made by the EBPF called in tcp_init_transfer() to persist, and not be wiped out by a later call to tcp_init_congestion_control() in tcp_init_transfer(). (2) Does not flip the order of EBPF and CC init, to avoid causing bugs for existing code upstream that depends on the current order. (3) Does not cause 2 initializations for for CC in the case where the EBPF called in tcp_init_transfer() wants to set the CC to a new CC algorithm. (4) Allows follow-on simplifications to the code in net/core/filter.c and net/ipv4/tcp_cong.c, which currently both have some complexity to special-case CC initialization to avoid double CC initialization if EBPF sets the CC. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 3 ++- net/ipv4/tcp_input.c | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 57a568875539..7360d3db2b61 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2698,6 +2698,7 @@ int tcp_disconnect(struct sock *sk, int flags) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62878cf26d9c..d18d7a1ce4ce 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -176,7 +176,7 @@ void tcp_assign_congestion_control(struct sock *sk) void tcp_init_congestion_control(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) @@ -185,6 +185,7 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); + icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4337841faeff..0e5ac0d33fd3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5894,8 +5894,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); tp->snd_cwnd_stamp = tcp_jiffies32; + icsk->icsk_ca_initialized = 0; bpf_skops_established(sk, bpf_op, skb); - tcp_init_congestion_control(sk); + if (!icsk->icsk_ca_initialized) + tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); } -- cgit v1.2.3 From 29a949325c6c90f1431db9af64592275c83d9b2a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:34 -0400 Subject: tcp: simplify tcp_set_congestion_control(): Always reinitialize Now that the previous patches ensure that all call sites for tcp_set_congestion_control() want to initialize congestion control, we can simplify tcp_set_congestion_control() by removing the reinit argument and the code to support it. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7360d3db2b61..e58ab9db73ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3050,7 +3050,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true, + err = tcp_set_congestion_control(sk, name, true, ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); release_sock(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d18d7a1ce4ce..a9b0fb52a1ec 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -341,7 +341,7 @@ out: * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin) + bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -365,15 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, if (!ca) { err = -ENOENT; } else if (!load) { - const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (bpf_try_module_get(ca, ca->owner)) { - if (reinit) { - tcp_reinit_congestion_control(sk, ca); - } else { - icsk->icsk_ca_ops = ca; - bpf_module_put(old_ca, old_ca->owner); - } + tcp_reinit_congestion_control(sk, ca); } else { err = -EBUSY; } -- cgit v1.2.3 From 5050bef8736facac341fb7e2c0eda5002619acf8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:36 -0400 Subject: tcp: Simplify tcp_set_congestion_control() load=false case Simplify tcp_set_congestion_control() by removing the initialization code path for the !load case. There are only two call sites for tcp_set_congestion_control(). The EBPF call site is the only one that passes load=false; it also passes cap_net_admin=true. Because of that, the exact same behavior can be achieved by removing the special if (!load) branch of the logic. Both before and after this commit, the EBPF case will call bpf_try_module_get(), and if that succeeds then call tcp_reinit_congestion_control() or if that fails then return EBUSY. Note that this returns the logic to a structure very similar to the structure before: commit 91b5b21c7c16 ("bpf: Add support for changing congestion control") except that the CAP_NET_ADMIN status is passed in as a function argument. This clean-up was suggested by Martin KaFai Lau. Suggested-by: Martin KaFai Lau Signed-off-by: Neal Cardwell Signed-off-by: Alexei Starovoitov Cc: Lawrence Brakmo Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Kevin Yang --- net/ipv4/tcp_cong.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a9b0fb52a1ec..db47ac24d057 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -362,21 +362,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, goto out; } - if (!ca) { + if (!ca) err = -ENOENT; - } else if (!load) { - if (bpf_try_module_get(ca, ca->owner)) { - tcp_reinit_congestion_control(sk, ca); - } else { - err = -EBUSY; - } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; - } else if (!bpf_try_module_get(ca, ca->owner)) { + else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; - } else { + else tcp_reinit_congestion_control(sk, ca); - } out: rcu_read_unlock(); return err; -- cgit v1.2.3 From 9436ef6e862b9ca22e5b12f87b106e07d5af4cae Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:20 +0100 Subject: bpf: Allow specifying a BTF ID per argument in function protos Function prototypes using ARG_PTR_TO_BTF_ID currently use two ways to signal which BTF IDs are acceptable. First, bpf_func_proto.btf_id is an array of IDs, one for each argument. This array is only accessed up to the highest numbered argument that uses ARG_PTR_TO_BTF_ID and may therefore be less than five arguments long. It usually points at a BTF_ID_LIST. Second, check_btf_id is a function pointer that is called by the verifier if present. It gets the actual BTF ID of the register, and the argument number we're currently checking. It turns out that the only user check_arg_btf_id ignores the argument, and is simply used to check whether the BTF ID has a struct sock_common at it's start. Replace both of these mechanisms with an explicit BTF ID for each argument in a function proto. Thanks to btf_struct_ids_match this is very flexible: check_arg_btf_id can be replaced by requiring struct sock_common. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-5-lmb@cloudflare.com --- net/ipv4/bpf_tcp_ca.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index e3939f76b024..74a2ef598c31 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,23 +28,18 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static int btf_sk_storage_get_ids[5]; static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; - -static int btf_sk_storage_delete_ids[5]; static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; -static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids, - const struct bpf_func_proto *from) +static void convert_sk_func_proto(struct bpf_func_proto *to, const struct bpf_func_proto *from) { int i; *to = *from; - to->btf_id = to_btf_ids; for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->btf_id[i] = tcp_sock_id; + to->arg_btf_id[i] = &tcp_sock_id; } } } @@ -64,12 +59,8 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - convert_sk_func_proto(&btf_sk_storage_get_proto, - btf_sk_storage_get_ids, - &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, - btf_sk_storage_delete_ids, - &bpf_sk_storage_delete_proto); + convert_sk_func_proto(&btf_sk_storage_get_proto, &bpf_sk_storage_get_proto); + convert_sk_func_proto(&btf_sk_storage_delete_proto, &bpf_sk_storage_delete_proto); return 0; } @@ -185,8 +176,8 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { /* In case we want to report error later */ .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &tcp_sock_id, .arg2_type = ARG_ANYTHING, - .btf_id = &tcp_sock_id, }; static const struct bpf_func_proto * -- cgit v1.2.3