summaryrefslogtreecommitdiffstats
path: root/net/ipv4/inet_connection_sock.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-25 12:22:58 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-25 12:22:58 -0700
commit7e062cda7d90543ac8c7700fc7c5527d0c0f22ad (patch)
tree2f1602595d9416be41cc2e88a659ba4c145734b9 /net/ipv4/inet_connection_sock.c
parent5d1772b1739b085721431eef0c0400f3aff01abf (diff)
parent57d7becda9c9e612e6b00676f2eecfac3e719e88 (diff)
downloadlinux-7e062cda7d90543ac8c7700fc7c5527d0c0f22ad.tar.gz
linux-7e062cda7d90543ac8c7700fc7c5527d0c0f22ad.tar.bz2
linux-7e062cda7d90543ac8c7700fc7c5527d0c0f22ad.zip
Merge tag 'net-next-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core ---- - Support TCPv6 segmentation offload with super-segments larger than 64k bytes using the IPv6 Jumbogram extension header (AKA BIG TCP). - Generalize skb freeing deferral to per-cpu lists, instead of per-socket lists. - Add a netdev statistic for packets dropped due to L2 address mismatch (rx_otherhost_dropped). - Continue work annotating skb drop reasons. - Accept alternative netdev names (ALT_IFNAME) in more netlink requests. - Add VLAN support for AF_PACKET SOCK_RAW GSO. - Allow receiving skb mark from the socket as a cmsg. - Enable memcg accounting for veth queues, sysctl tables and IPv6. BPF --- - Add libbpf support for User Statically-Defined Tracing (USDTs). - Speed up symbol resolution for kprobes multi-link attachments. - Support storing typed pointers to referenced and unreferenced objects in BPF maps. - Add support for BPF link iterator. - Introduce access to remote CPU map elements in BPF per-cpu map. - Allow middle-of-the-road settings for the kernel.unprivileged_bpf_disabled sysctl. - Implement basic types of dynamic pointers e.g. to allow for dynamically sized ringbuf reservations without extra memory copies. Protocols --------- - Retire port only listening_hash table, add a second bind table hashed by port and address. Avoid linear list walk when binding to very popular ports (e.g. 443). - Add bridge FDB bulk flush filtering support allowing user space to remove all FDB entries matching a condition. - Introduce accept_unsolicited_na sysctl for IPv6 to implement router-side changes for RFC9131. - Support for MPTCP path manager in user space. - Add MPTCP support for fallback to regular TCP for connections that have never connected additional subflows or transmitted out-of-sequence data (partial support for RFC8684 fallback). - Avoid races in MPTCP-level window tracking, stabilize and improve throughput. - Support lockless operation of GRE tunnels with seq numbers enabled. - WiFi support for host based BSS color collision detection. - Add support for SO_TXTIME/SCM_TXTIME on CAN sockets. - Support transmission w/o flow control in CAN ISOTP (ISO 15765-2). - Support zero-copy Tx with TLS 1.2 crypto offload (sendfile). - Allow matching on the number of VLAN tags via tc-flower. - Add tracepoint for tcp_set_ca_state(). Driver API ---------- - Improve error reporting from classifier and action offload. - Add support for listing line cards in switches (devlink). - Add helpers for reporting page pool statistics with ethtool -S. - Add support for reading clock cycles when using PTP virtual clocks, instead of having the driver convert to time before reporting. This makes it possible to report time from different vclocks. - Support configuring low-latency Tx descriptor push via ethtool. - Separate Clause 22 and Clause 45 MDIO accesses more explicitly. New hardware / drivers ---------------------- - Ethernet: - Marvell's Octeon NIC PCI Endpoint support (octeon_ep) - Sunplus SP7021 SoC (sp7021_emac) - Add support for Renesas RZ/V2M (in ravb) - Add support for MediaTek mt7986 switches (in mtk_eth_soc) - Ethernet PHYs: - ADIN1100 industrial PHYs (w/ 10BASE-T1L and SQI reporting) - TI DP83TD510 PHY - Microchip LAN8742/LAN88xx PHYs - WiFi: - Driver for pureLiFi X, XL, XC devices (plfxlc) - Driver for Silicon Labs devices (wfx) - Support for WCN6750 (in ath11k) - Support Realtek 8852ce devices (in rtw89) - Mobile: - MediaTek T700 modems (Intel 5G 5000 M.2 cards) - CAN: - ctucanfd: add support for CTU CAN FD open-source IP core from Czech Technical University in Prague Drivers ------- - Delete a number of old drivers still using virt_to_bus(). - Ethernet NICs: - intel: support TSO on tunnels MPLS - broadcom: support multi-buffer XDP - nfp: support VF rate limiting - sfc: use hardware tx timestamps for more than PTP - mlx5: multi-port eswitch support - hyper-v: add support for XDP_REDIRECT - atlantic: XDP support (including multi-buffer) - macb: improve real-time perf by deferring Tx processing to NAPI - High-speed Ethernet switches: - mlxsw: implement basic line card information querying - prestera: add support for traffic policing on ingress and egress - Embedded Ethernet switches: - lan966x: add support for packet DMA (FDMA) - lan966x: add support for PTP programmable pins - ti: cpsw_new: enable bc/mc storm prevention - Qualcomm 802.11ax WiFi (ath11k): - Wake-on-WLAN support for QCA6390 and WCN6855 - device recovery (firmware restart) support - support setting Specific Absorption Rate (SAR) for WCN6855 - read country code from SMBIOS for WCN6855/QCA6390 - enable keep-alive during WoWLAN suspend - implement remain-on-channel support - MediaTek WiFi (mt76): - support Wireless Ethernet Dispatch offloading packet movement between the Ethernet switch and WiFi interfaces - non-standard VHT MCS10-11 support - mt7921 AP mode support - mt7921 IPv6 NS offload support - Ethernet PHYs: - micrel: ksz9031/ksz9131: cabletest support - lan87xx: SQI support for T1 PHYs - lan937x: add interrupt support for link detection" * tag 'net-next-5.19' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1809 commits) ptp: ocp: Add firmware header checks ptp: ocp: fix PPS source selector debugfs reporting ptp: ocp: add .init function for sma_op vector ptp: ocp: vectorize the sma accessor functions ptp: ocp: constify selectors ptp: ocp: parameterize input/output sma selectors ptp: ocp: revise firmware display ptp: ocp: add Celestica timecard PCI ids ptp: ocp: Remove #ifdefs around PCI IDs ptp: ocp: 32-bit fixups for pci start address Revert "net/smc: fix listen processing for SMC-Rv2" ath6kl: Use cc-disable-warning to disable -Wdangling-pointer selftests/bpf: Dynptr tests bpf: Add dynptr data slices bpf: Add bpf_dynptr_read and bpf_dynptr_write bpf: Dynptr support for ring buffers bpf: Add bpf_dynptr_from_mem for local dynptrs bpf: Add verifier support for dynptrs bpf: Suppress 'passing zero to PTR_ERR' warning bpf: Introduce bpf_arch_text_invalidate for bpf_prog_pack ...
Diffstat (limited to 'net/ipv4/inet_connection_sock.c')
-rw-r--r--net/ipv4/inet_connection_sock.c245
1 files changed, 184 insertions, 61 deletions
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1e5b53c2bb26..c0b7e6c21360 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -117,6 +117,32 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
+static bool use_bhash2_on_bind(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ int addr_type;
+
+ if (sk->sk_family == AF_INET6) {
+ addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+ return addr_type != IPV6_ADDR_ANY &&
+ addr_type != IPV6_ADDR_MAPPED;
+ }
+#endif
+ return sk->sk_rcv_saddr != htonl(INADDR_ANY);
+}
+
+static u32 get_bhash2_nulladdr_hash(const struct sock *sk, struct net *net,
+ int port)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr nulladdr = {};
+
+ if (sk->sk_family == AF_INET6)
+ return ipv6_portaddr_hash(net, &nulladdr, port);
+#endif
+ return ipv4_portaddr_hash(net, 0, port);
+}
+
void inet_get_local_port_range(struct net *net, int *low, int *high)
{
unsigned int seq;
@@ -130,16 +156,71 @@ void inet_get_local_port_range(struct net *net, int *low, int *high)
}
EXPORT_SYMBOL(inet_get_local_port_range);
-static int inet_csk_bind_conflict(const struct sock *sk,
- const struct inet_bind_bucket *tb,
- bool relax, bool reuseport_ok)
+static bool bind_conflict_exist(const struct sock *sk, struct sock *sk2,
+ kuid_t sk_uid, bool relax,
+ bool reuseport_cb_ok, bool reuseport_ok)
+{
+ int bound_dev_if2;
+
+ if (sk == sk2)
+ return false;
+
+ bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
+
+ if (!sk->sk_bound_dev_if || !bound_dev_if2 ||
+ sk->sk_bound_dev_if == bound_dev_if2) {
+ if (sk->sk_reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+ if (!relax || (!reuseport_ok && sk->sk_reuseport &&
+ sk2->sk_reuseport && reuseport_cb_ok &&
+ (sk2->sk_state == TCP_TIME_WAIT ||
+ uid_eq(sk_uid, sock_i_uid(sk2)))))
+ return true;
+ } else if (!reuseport_ok || !sk->sk_reuseport ||
+ !sk2->sk_reuseport || !reuseport_cb_ok ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(sk_uid, sock_i_uid(sk2)))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool check_bhash2_conflict(const struct sock *sk,
+ struct inet_bind2_bucket *tb2, kuid_t sk_uid,
+ bool relax, bool reuseport_cb_ok,
+ bool reuseport_ok)
{
struct sock *sk2;
- bool reuseport_cb_ok;
- bool reuse = sk->sk_reuse;
- bool reuseport = !!sk->sk_reuseport;
- struct sock_reuseport *reuseport_cb;
+
+ sk_for_each_bound_bhash2(sk2, &tb2->owners) {
+ if (sk->sk_family == AF_INET && ipv6_only_sock(sk2))
+ continue;
+
+ if (bind_conflict_exist(sk, sk2, sk_uid, relax,
+ reuseport_cb_ok, reuseport_ok))
+ return true;
+ }
+ return false;
+}
+
+/* This should be called only when the corresponding inet_bind_bucket spinlock
+ * is held
+ */
+static int inet_csk_bind_conflict(const struct sock *sk, int port,
+ struct inet_bind_bucket *tb,
+ struct inet_bind2_bucket *tb2, /* may be null */
+ bool relax, bool reuseport_ok)
+{
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
kuid_t uid = sock_i_uid((struct sock *)sk);
+ struct sock_reuseport *reuseport_cb;
+ struct inet_bind2_hashbucket *head2;
+ bool reuseport_cb_ok;
+ struct sock *sk2;
+ struct net *net;
+ int l3mdev;
+ u32 hash;
rcu_read_lock();
reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
@@ -150,36 +231,42 @@ static int inet_csk_bind_conflict(const struct sock *sk,
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
- * in tb->owners list belong to the same net - the
- * one this bucket belongs to.
+ * in tb->owners and tb2->owners list belong
+ * to the same net
*/
- sk_for_each_bound(sk2, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if (reuse && sk2->sk_reuse &&
- sk2->sk_state != TCP_LISTEN) {
- if ((!relax ||
- (!reuseport_ok &&
- reuseport && sk2->sk_reuseport &&
- reuseport_cb_ok &&
- (sk2->sk_state == TCP_TIME_WAIT ||
- uid_eq(uid, sock_i_uid(sk2))))) &&
- inet_rcv_saddr_equal(sk, sk2, true))
- break;
- } else if (!reuseport_ok ||
- !reuseport || !sk2->sk_reuseport ||
- !reuseport_cb_ok ||
- (sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(uid, sock_i_uid(sk2)))) {
- if (inet_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- }
+ if (!use_bhash2_on_bind(sk)) {
+ sk_for_each_bound(sk2, &tb->owners)
+ if (bind_conflict_exist(sk, sk2, uid, relax,
+ reuseport_cb_ok, reuseport_ok) &&
+ inet_rcv_saddr_equal(sk, sk2, true))
+ return true;
+
+ return false;
}
- return sk2 != NULL;
+
+ if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
+ reuseport_ok))
+ return true;
+
+ net = sock_net(sk);
+
+ /* check there's no conflict with an existing IPV6_ADDR_ANY (if ipv6) or
+ * INADDR_ANY (if ipv4) socket.
+ */
+ hash = get_bhash2_nulladdr_hash(sk, net, port);
+ head2 = &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
+ inet_bind_bucket_for_each(tb2, &head2->chain)
+ if (check_bind2_bucket_match_nulladdr(tb2, net, port, l3mdev, sk))
+ break;
+
+ if (tb2 && check_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
+ reuseport_ok))
+ return true;
+
+ return false;
}
/*
@@ -187,16 +274,20 @@ static int inet_csk_bind_conflict(const struct sock *sk,
* inet_bind_hashbucket lock held.
*/
static struct inet_bind_hashbucket *
-inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
+inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret,
+ struct inet_bind2_bucket **tb2_ret,
+ struct inet_bind2_hashbucket **head2_ret, int *port_ret)
{
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int port = 0;
+ struct inet_bind2_hashbucket *head2;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
- bool relax = false;
int i, low, high, attempt_half;
+ struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
u32 remaining, offset;
+ bool relax = false;
+ int port = 0;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
@@ -235,10 +326,12 @@ other_parity_scan:
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
+ tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk,
+ &head2);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
- tb->port == port) {
- if (!inet_csk_bind_conflict(sk, tb, relax, false))
+ if (check_bind_bucket_match(tb, net, port, l3mdev)) {
+ if (!inet_csk_bind_conflict(sk, port, tb, tb2,
+ relax, false))
goto success;
goto next_port;
}
@@ -268,6 +361,8 @@ next_port:
success:
*port_ret = port;
*tb_ret = tb;
+ *tb2_ret = tb2;
+ *head2_ret = head2;
return head;
}
@@ -363,54 +458,81 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, port = snum;
+ bool bhash_created = false, bhash2_created = false;
+ struct inet_bind2_bucket *tb2 = NULL;
+ struct inet_bind2_hashbucket *head2;
+ struct inet_bind_bucket *tb = NULL;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
- struct inet_bind_bucket *tb = NULL;
+ int ret = 1, port = snum;
+ bool found_port = false;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
if (!port) {
- head = inet_csk_find_open_port(sk, &tb, &port);
+ head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
if (!head)
return ret;
+ if (tb && tb2)
+ goto success;
+ found_port = true;
+ } else {
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (check_bind_bucket_match(tb, net, port, l3mdev))
+ break;
+
+ tb2 = inet_bind2_bucket_find(hinfo, net, port, l3mdev, sk,
+ &head2);
+ }
+
+ if (!tb) {
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net,
+ head, port, l3mdev);
if (!tb)
- goto tb_not_found;
- goto success;
+ goto fail_unlock;
+ bhash_created = true;
}
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock_bh(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
- tb->port == port)
- goto tb_found;
-tb_not_found:
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port, l3mdev);
- if (!tb)
- goto fail_unlock;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
+
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
+ net, head2, port, l3mdev, sk);
+ if (!tb2)
+ goto fail_unlock;
+ bhash2_created = true;
+ }
+
+ /* If we had to find an open port, we already checked for conflicts */
+ if (!found_port && !hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
if ((tb->fastreuse > 0 && reuse) ||
sk_reuseport_match(tb, sk))
goto success;
- if (inet_csk_bind_conflict(sk, tb, true, true))
+ if (inet_csk_bind_conflict(sk, port, tb, tb2, true, true))
goto fail_unlock;
}
success:
inet_csk_update_fastreuse(tb, sk);
if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, tb2, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+ WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
ret = 0;
fail_unlock:
+ if (ret) {
+ if (bhash_created)
+ inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
+ if (bhash2_created)
+ inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
+ tb2);
+ }
spin_unlock_bh(&head->lock);
return ret;
}
@@ -957,6 +1079,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk_set_state(newsk, TCP_SYN_RECV);
newicsk->icsk_bind_hash = NULL;
+ newicsk->icsk_bind2_hash = NULL;
inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;