summaryrefslogtreecommitdiffstats
path: root/net/mptcp
diff options
context:
space:
mode:
Diffstat (limited to 'net/mptcp')
-rw-r--r--net/mptcp/mib.c8
-rw-r--r--net/mptcp/mib.h8
-rw-r--r--net/mptcp/mptcp_diag.c6
-rw-r--r--net/mptcp/options.c61
-rw-r--r--net/mptcp/pm.c22
-rw-r--r--net/mptcp/pm_netlink.c515
-rw-r--r--net/mptcp/protocol.c185
-rw-r--r--net/mptcp/protocol.h116
-rw-r--r--net/mptcp/subflow.c154
9 files changed, 901 insertions, 174 deletions
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index b921cbdd9aaa..3780c29c321d 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -29,8 +29,16 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA),
SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR),
SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD),
+ SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD),
+ SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX),
+ SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX),
+ SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX),
+ SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX),
SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR),
SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW),
+ SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX),
+ SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX),
SNMP_MIB_SENTINEL
};
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 47bcecce1106..72afbc135f8e 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -22,8 +22,16 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_DUPDATA, /* Segments discarded due to duplicate DSS */
MPTCP_MIB_ADDADDR, /* Received ADD_ADDR with echo-flag=0 */
MPTCP_MIB_ECHOADD, /* Received ADD_ADDR with echo-flag=1 */
+ MPTCP_MIB_PORTADD, /* Received ADD_ADDR with a port-number */
+ MPTCP_MIB_JOINPORTSYNRX, /* Received a SYN MP_JOIN with a different port-number */
+ MPTCP_MIB_JOINPORTSYNACKRX, /* Received a SYNACK MP_JOIN with a different port-number */
+ MPTCP_MIB_JOINPORTACKRX, /* Received an ACK MP_JOIN with a different port-number */
+ MPTCP_MIB_MISMATCHPORTSYNRX, /* Received a SYN MP_JOIN with a mismatched port-number */
+ MPTCP_MIB_MISMATCHPORTACKRX, /* Received an ACK MP_JOIN with a mismatched port-number */
MPTCP_MIB_RMADDR, /* Received RM_ADDR */
MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */
+ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */
+ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index b70ae4ba3000..00ed742f48a4 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -128,10 +128,10 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
- info->mptcpi_subflows_max = READ_ONCE(msk->pm.subflows_max);
- val = READ_ONCE(msk->pm.add_addr_signal_max);
+ info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk);
+ val = mptcp_pm_get_add_addr_signal_max(msk);
info->mptcpi_add_addr_signal_max = val;
- val = READ_ONCE(msk->pm.add_addr_accept_max);
+ val = mptcp_pm_get_add_addr_accept_max(msk);
info->mptcpi_add_addr_accepted_max = val;
if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags))
flags |= MPTCP_INFO_FLAG_FALLBACK;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index e0d21c0607e5..3b71d68b3863 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -282,6 +282,15 @@ static void mptcp_parse_option(const struct sk_buff *skb,
pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
break;
+ case MPTCPOPT_MP_PRIO:
+ if (opsize != TCPOLEN_MPTCP_PRIO)
+ break;
+
+ mp_opt->mp_prio = 1;
+ mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
+ pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
+ break;
+
case MPTCPOPT_MP_FASTCLOSE:
if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
break;
@@ -313,6 +322,7 @@ void mptcp_get_options(const struct sk_buff *skb,
mp_opt->port = 0;
mp_opt->rm_addr = 0;
mp_opt->dss = 0;
+ mp_opt->mp_prio = 0;
length = (th->doff * 4) - sizeof(struct tcphdr);
ptr = (const unsigned char *)(th + 1);
@@ -679,6 +689,29 @@ static bool mptcp_established_options_rm_addr(struct sock *sk,
return true;
}
+static bool mptcp_established_options_mp_prio(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ if (!subflow->send_mp_prio)
+ return false;
+
+ /* account for the trailing 'nop' option */
+ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN)
+ return false;
+
+ *size = TCPOLEN_MPTCP_PRIO_ALIGN;
+ opts->suboptions |= OPTION_MPTCP_PRIO;
+ opts->backup = subflow->request_bkup;
+
+ pr_debug("prio=%d", opts->backup);
+
+ return true;
+}
+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
@@ -721,6 +754,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
ret = true;
}
+ if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
+
return ret;
}
@@ -986,6 +1025,10 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mptcp_pm_del_add_timer(msk, &addr);
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
}
+
+ if (mp_opt.port)
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD);
+
mp_opt.add_addr = 0;
}
@@ -994,6 +1037,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
mp_opt.rm_addr = 0;
}
+ if (mp_opt.mp_prio) {
+ mptcp_pm_mp_prio_received(sk, mp_opt.backup);
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX);
+ mp_opt.mp_prio = 0;
+ }
+
if (!mp_opt.dss)
return;
@@ -1168,6 +1217,18 @@ mp_capable_done:
0, opts->rm_id);
}
+ if (OPTION_MPTCP_PRIO & opts->suboptions) {
+ const struct sock *ssk = (const struct sock *)tp;
+ struct mptcp_subflow_context *subflow;
+
+ subflow = mptcp_subflow_ctx(ssk);
+ subflow->send_mp_prio = 0;
+
+ *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
+ TCPOLEN_MPTCP_PRIO,
+ opts->backup, TCPOPT_NOP);
+ }
+
if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
TCPOLEN_MPTCP_MPJ_SYN,
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index da2ed576f289..1a25003fd8e3 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -20,6 +20,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
pr_debug("msk=%p, local_id=%d", msk, addr->id);
+ lockdep_assert_held(&msk->pm.lock);
+
if (add_addr) {
pr_warn("addr_signal error, add_addr=%d", add_addr);
return -EINVAL;
@@ -78,10 +80,13 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
+ unsigned int subflows_max;
int ret = 0;
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
- pm->subflows_max, READ_ONCE(pm->accept_subflow));
+ subflows_max, READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->accept_subflow))
@@ -89,8 +94,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->accept_subflow)) {
- ret = pm->subflows < pm->subflows_max;
- if (ret && ++pm->subflows == pm->subflows_max)
+ ret = pm->subflows < subflows_max;
+ if (ret && ++pm->subflows == subflows_max)
WRITE_ONCE(pm->accept_subflow, false);
}
spin_unlock_bh(&pm->lock);
@@ -188,8 +193,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
{
- if (!mptcp_pm_should_add_signal_ipv6(msk) &&
- !mptcp_pm_should_add_signal_port(msk))
+ if (!mptcp_pm_should_add_signal(msk))
return;
mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
@@ -207,6 +211,14 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id)
spin_unlock_bh(&pm->lock);
}
+void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
+ subflow->backup = bkup;
+}
+
/* path manager helpers */
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index a6d983d80576..23780a13b934 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -26,6 +26,7 @@ struct mptcp_pm_addr_entry {
struct list_head list;
struct mptcp_addr_info addr;
struct rcu_head rcu;
+ struct socket *lsk;
};
struct mptcp_pm_add_entry {
@@ -36,6 +37,9 @@ struct mptcp_pm_add_entry {
u8 retrans_times;
};
+#define MAX_ADDR_ID 255
+#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG)
+
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
@@ -46,6 +50,7 @@ struct pm_nl_pernet {
unsigned int local_addr_max;
unsigned int subflows_max;
unsigned int next_id;
+ unsigned long id_bitmap[BITMAP_SZ];
};
#define MPTCP_PM_ADDR_MAX 8
@@ -56,15 +61,20 @@ static bool addresses_equal(const struct mptcp_addr_info *a,
{
bool addr_equals = false;
- if (a->family != b->family)
- return false;
-
- if (a->family == AF_INET)
- addr_equals = a->addr.s_addr == b->addr.s_addr;
+ if (a->family == b->family) {
+ if (a->family == AF_INET)
+ addr_equals = a->addr.s_addr == b->addr.s_addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- else
- addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+ else
+ addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+ } else if (a->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&b->addr6))
+ addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3];
+ } else if (b->family == AF_INET) {
+ if (ipv6_addr_v4mapped(&a->addr6))
+ addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr;
#endif
+ }
if (!addr_equals)
return false;
@@ -81,14 +91,14 @@ static bool address_zero(const struct mptcp_addr_info *addr)
memset(&zero, 0, sizeof(zero));
zero.family = addr->family;
- return addresses_equal(addr, &zero, false);
+ return addresses_equal(addr, &zero, true);
}
static void local_address(const struct sock_common *skc,
struct mptcp_addr_info *addr)
{
- addr->port = 0;
addr->family = skc->skc_family;
+ addr->port = htons(skc->skc_num);
if (addr->family == AF_INET)
addr->addr.s_addr = skc->skc_rcv_saddr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -121,7 +131,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list,
skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
local_address(skc, &cur);
- if (addresses_equal(&cur, saddr, false))
+ if (addresses_equal(&cur, saddr, saddr->port))
return true;
}
@@ -133,6 +143,9 @@ select_local_address(const struct pm_nl_pernet *pernet,
struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *ret = NULL;
+ struct sock *sk = (struct sock *)msk;
+
+ msk_owned_by_me(msk);
rcu_read_lock();
__mptcp_flush_join_list(msk);
@@ -140,11 +153,20 @@ select_local_address(const struct pm_nl_pernet *pernet,
if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
+ if (entry->addr.family != sk->sk_family) {
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if ((entry->addr.family == AF_INET &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ||
+ (sk->sk_family == AF_INET &&
+ !ipv6_addr_v4mapped(&entry->addr.addr6)))
+#endif
+ continue;
+ }
+
/* avoid any address already in use by subflows and
* pending join
*/
- if (entry->addr.family == ((struct sock *)msk)->sk_family &&
- !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
+ if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) {
ret = entry;
break;
}
@@ -177,11 +199,46 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
return ret;
}
+unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->add_addr_signal_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
+
+unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->add_addr_accept_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
+
+unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->subflows_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
+
+static unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+ return READ_ONCE(pernet->local_addr_max);
+}
+
static void check_work_pending(struct mptcp_sock *msk)
{
- if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max &&
- (msk->pm.local_addr_used == msk->pm.local_addr_max ||
- msk->pm.subflows == msk->pm.subflows_max))
+ if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) &&
+ (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) ||
+ msk->pm.subflows == mptcp_pm_get_subflows_max(msk)))
WRITE_ONCE(msk->pm.work_pending, false);
}
@@ -191,14 +248,37 @@ lookup_anno_list_by_saddr(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *entry;
+ lockdep_assert_held(&msk->pm.lock);
+
list_for_each_entry(entry, &msk->pm.anno_list, list) {
- if (addresses_equal(&entry->addr, addr, false))
+ if (addresses_equal(&entry->addr, addr, true))
return entry;
}
return NULL;
}
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk)
+{
+ struct mptcp_pm_add_entry *entry;
+ struct mptcp_addr_info saddr;
+ bool ret = false;
+
+ local_address((struct sock_common *)sk, &saddr);
+
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.anno_list, list) {
+ if (addresses_equal(&entry->addr, &saddr, true)) {
+ ret = true;
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
static void mptcp_pm_add_timer(struct timer_list *timer)
{
struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer);
@@ -266,6 +346,8 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
struct sock *sk = (struct sock *)msk;
struct net *net = sock_net(sk);
+ lockdep_assert_held(&msk->pm.lock);
+
if (lookup_anno_list_by_saddr(msk, &entry->addr))
return false;
@@ -306,20 +388,26 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
{
- struct mptcp_addr_info remote = { 0 };
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *local;
+ unsigned int add_addr_signal_max;
+ unsigned int local_addr_max;
struct pm_nl_pernet *pernet;
+ unsigned int subflows_max;
pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
+ add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
+ local_addr_max = mptcp_pm_get_local_addr_max(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
- msk->pm.local_addr_used, msk->pm.local_addr_max,
- msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max,
- msk->pm.subflows, msk->pm.subflows_max);
+ msk->pm.local_addr_used, local_addr_max,
+ msk->pm.add_addr_signaled, add_addr_signal_max,
+ msk->pm.subflows, subflows_max);
/* check first for announce */
- if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) {
+ if (msk->pm.add_addr_signaled < add_addr_signal_max) {
local = select_signal_address(pernet,
msk->pm.add_addr_signaled);
@@ -331,22 +419,23 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
}
} else {
/* pick failed, avoid fourther attempts later */
- msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
+ msk->pm.local_addr_used = add_addr_signal_max;
}
check_work_pending(msk);
}
/* check if should create a new subflow */
- if (msk->pm.local_addr_used < msk->pm.local_addr_max &&
- msk->pm.subflows < msk->pm.subflows_max) {
- remote_address((struct sock_common *)sk, &remote);
-
+ if (msk->pm.local_addr_used < local_addr_max &&
+ msk->pm.subflows < subflows_max) {
local = select_local_address(pernet, msk);
if (local) {
+ struct mptcp_addr_info remote = { 0 };
+
msk->pm.local_addr_used++;
msk->pm.subflows++;
check_work_pending(msk);
+ remote_address((struct sock_common *)sk, &remote);
spin_unlock_bh(&msk->pm.lock);
__mptcp_subflow_connect(sk, &local->addr, &remote);
spin_lock_bh(&msk->pm.lock);
@@ -354,7 +443,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
}
/* lookup failed, avoid fourther attempts later */
- msk->pm.local_addr_used = msk->pm.local_addr_max;
+ msk->pm.local_addr_used = local_addr_max;
check_work_pending(msk);
}
}
@@ -372,17 +461,22 @@ void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
struct sock *sk = (struct sock *)msk;
+ unsigned int add_addr_accept_max;
struct mptcp_addr_info remote;
struct mptcp_addr_info local;
+ unsigned int subflows_max;
bool use_port = false;
+ add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
+ subflows_max = mptcp_pm_get_subflows_max(msk);
+
pr_debug("accepted %d:%d remote family %d",
- msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max,
+ msk->pm.add_addr_accepted, add_addr_accept_max,
msk->pm.remote.family);
msk->pm.add_addr_accepted++;
msk->pm.subflows++;
- if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max ||
- msk->pm.subflows >= msk->pm.subflows_max)
+ if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
+ msk->pm.subflows >= subflows_max)
WRITE_ONCE(msk->pm.accept_addr, false);
/* connect to the specified remote address, using whatever
@@ -408,8 +502,10 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
- if (!mptcp_pm_should_add_signal_ipv6(msk) &&
- !mptcp_pm_should_add_signal_port(msk))
+ msk_owned_by_me(msk);
+ lockdep_assert_held(&msk->pm.lock);
+
+ if (!mptcp_pm_should_add_signal(msk))
return;
__mptcp_flush_join_list(msk);
@@ -419,10 +515,9 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
u8 add_addr;
spin_unlock_bh(&msk->pm.lock);
- if (mptcp_pm_should_add_signal_ipv6(msk))
- pr_debug("send ack for add_addr6");
- if (mptcp_pm_should_add_signal_port(msk))
- pr_debug("send ack for add_addr_port");
+ pr_debug("send ack for add_addr%s%s",
+ mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "",
+ mptcp_pm_should_add_signal_port(msk) ? " [port]" : "");
lock_sock(ssk);
tcp_send_ack(ssk);
@@ -438,6 +533,41 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk)
}
}
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ u8 bkup)
+{
+ struct mptcp_subflow_context *subflow;
+
+ pr_debug("bkup=%d", bkup);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info local;
+
+ local_address((struct sock_common *)ssk, &local);
+ if (!addresses_equal(&local, addr, addr->port))
+ continue;
+
+ subflow->backup = bkup;
+ subflow->send_mp_prio = 1;
+ subflow->request_bkup = bkup;
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIOTX);
+
+ spin_unlock_bh(&msk->pm.lock);
+ pr_debug("send ack for mp_prio");
+ lock_sock(ssk);
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ spin_lock_bh(&msk->pm.lock);
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
@@ -445,6 +575,8 @@ void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
pr_debug("address rm_id %d", msk->pm.rm_id);
+ msk_owned_by_me(msk);
+
if (!msk->pm.rm_id)
return;
@@ -480,6 +612,8 @@ void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id)
pr_debug("subflow rm_id %d", rm_id);
+ msk_owned_by_me(msk);
+
if (!rm_id)
return;
@@ -518,16 +652,19 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
struct mptcp_pm_addr_entry *entry)
{
struct mptcp_pm_addr_entry *cur;
+ unsigned int addr_max;
int ret = -EINVAL;
spin_lock_bh(&pernet->lock);
/* to keep the code simple, don't do IDR-like allocation for address ID,
* just bail when we exceed limits
*/
- if (pernet->next_id > 255)
- goto out;
+ if (pernet->next_id == MAX_ADDR_ID)
+ pernet->next_id = 1;
if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
goto out;
+ if (test_bit(entry->addr.id, pernet->id_bitmap))
+ goto out;
/* do not insert duplicate address, differentiate on port only
* singled addresses
@@ -539,12 +676,34 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
- pernet->add_addr_signal_max++;
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
- pernet->local_addr_max++;
+ if (!entry->addr.id) {
+find_next:
+ entry->addr.id = find_next_zero_bit(pernet->id_bitmap,
+ MAX_ADDR_ID + 1,
+ pernet->next_id);
+ if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) &&
+ pernet->next_id != 1) {
+ pernet->next_id = 1;
+ goto find_next;
+ }
+ }
+
+ if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID)
+ goto out;
+
+ __set_bit(entry->addr.id, pernet->id_bitmap);
+ if (entry->addr.id > pernet->next_id)
+ pernet->next_id = entry->addr.id;
+
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->add_addr_signal_max;
+ WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
+ }
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->local_addr_max;
+ WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
+ }
- entry->addr.id = pernet->next_id++;
pernet->addrs++;
list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
ret = entry->addr.id;
@@ -554,6 +713,53 @@ out:
return ret;
}
+static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct sockaddr_storage addr;
+ struct mptcp_sock *msk;
+ struct socket *ssock;
+ int backlog = 1024;
+ int err;
+
+ err = sock_create_kern(sock_net(sk), entry->addr.family,
+ SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
+ if (err)
+ return err;
+
+ msk = mptcp_sk(entry->lsk->sk);
+ if (!msk) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ ssock = __mptcp_nmpc_socket(msk);
+ if (!ssock) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
+ err = kernel_bind(ssock, (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in));
+ if (err) {
+ pr_warn("kernel_bind error, err=%d", err);
+ goto out;
+ }
+
+ err = kernel_listen(ssock, backlog);
+ if (err) {
+ pr_warn("kernel_listen error, err=%d", err);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ sock_release(entry->lsk);
+ return err;
+}
+
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
struct mptcp_pm_addr_entry *entry;
@@ -580,7 +786,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
rcu_read_lock();
list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
- if (addresses_equal(&entry->addr, &skc_local, false)) {
+ if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) {
ret = entry->addr.id;
break;
}
@@ -597,6 +803,9 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
entry->addr = skc_local;
entry->addr.ifindex = 0;
entry->addr.flags = 0;
+ entry->addr.id = 0;
+ entry->addr.port = 0;
+ entry->lsk = NULL;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -607,19 +816,12 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- struct pm_nl_pernet *pernet;
bool subflows;
- pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
-
- pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max);
- pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max);
- pm->local_addr_max = READ_ONCE(pernet->local_addr_max);
- pm->subflows_max = READ_ONCE(pernet->subflows_max);
- subflows = !!pm->subflows_max;
- WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) ||
- !!pm->add_addr_signal_max);
- WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows);
+ subflows = !!mptcp_pm_get_subflows_max(msk);
+ WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) ||
+ !!mptcp_pm_get_add_addr_signal_max(msk));
+ WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows);
WRITE_ONCE(pm->accept_subflow, subflows);
}
@@ -722,6 +924,9 @@ skip_family:
if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+ if (tb[MPTCP_PM_ADDR_ATTR_PORT])
+ entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT]));
+
return 0;
}
@@ -730,6 +935,31 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
return net_generic(genl_info_net(info), pm_nl_pernet_id);
}
+static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
+{
+ struct mptcp_sock *msk;
+ long s_slot = 0, s_num = 0;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (!READ_ONCE(msk->fully_established))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return 0;
+}
+
static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
@@ -748,13 +978,25 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
}
*entry = addr;
+ if (entry->addr.port) {
+ ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
+ if (ret) {
+ GENL_SET_ERR_MSG(info, "create listen socket error");
+ kfree(entry);
+ return ret;
+ }
+ }
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0) {
GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
+ if (entry->lsk)
+ sock_release(entry->lsk);
kfree(entry);
return ret;
}
+ mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
+
return 0;
}
@@ -832,11 +1074,44 @@ next:
return 0;
}
+struct addr_entry_release_work {
+ struct rcu_work rwork;
+ struct mptcp_pm_addr_entry *entry;
+};
+
+static void mptcp_pm_release_addr_entry(struct work_struct *work)
+{
+ struct addr_entry_release_work *w;
+ struct mptcp_pm_addr_entry *entry;
+
+ w = container_of(to_rcu_work(work), struct addr_entry_release_work, rwork);
+ entry = w->entry;
+ if (entry) {
+ if (entry->lsk)
+ sock_release(entry->lsk);
+ kfree(entry);
+ }
+ kfree(w);
+}
+
+static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry)
+{
+ struct addr_entry_release_work *w;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (w) {
+ INIT_RCU_WORK(&w->rwork, mptcp_pm_release_addr_entry);
+ w->entry = entry;
+ queue_rcu_work(system_wq, &w->rwork);
+ }
+}
+
static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
+ unsigned int addr_max;
int ret;
ret = mptcp_pm_parse_addr(attr, info, false, &addr);
@@ -850,17 +1125,22 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
spin_unlock_bh(&pernet->lock);
return -EINVAL;
}
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
- pernet->add_addr_signal_max--;
- if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
- pernet->local_addr_max--;
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ addr_max = pernet->add_addr_signal_max;
+ WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
+ }
+ if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ addr_max = pernet->local_addr_max;
+ WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
+ }
pernet->addrs--;
list_del_rcu(&entry->list);
+ __clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr);
- kfree_rcu(entry, rcu);
+ mptcp_pm_free_addr_entry(entry);
return ret;
}
@@ -874,15 +1154,15 @@ static void __flush_addrs(struct net *net, struct list_head *list)
struct mptcp_pm_addr_entry, list);
mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr);
list_del_rcu(&cur->list);
- kfree_rcu(cur, rcu);
+ mptcp_pm_free_addr_entry(cur);
}
}
static void __reset_counters(struct pm_nl_pernet *pernet)
{
- pernet->add_addr_signal_max = 0;
- pernet->add_addr_accept_max = 0;
- pernet->local_addr_max = 0;
+ WRITE_ONCE(pernet->add_addr_signal_max, 0);
+ WRITE_ONCE(pernet->add_addr_accept_max, 0);
+ WRITE_ONCE(pernet->local_addr_max, 0);
pernet->addrs = 0;
}
@@ -894,6 +1174,8 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
spin_lock_bh(&pernet->lock);
list_splice_init(&pernet->local_addr_list, &free_list);
__reset_counters(pernet);
+ pernet->next_id = 1;
+ bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
spin_unlock_bh(&pernet->lock);
__flush_addrs(sock_net(skb->sk), &free_list);
return 0;
@@ -911,6 +1193,8 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb,
if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
goto nla_put_failure;
+ if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port)))
+ goto nla_put_failure;
if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
goto nla_put_failure;
if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags))
@@ -994,27 +1278,34 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
struct pm_nl_pernet *pernet;
int id = cb->args[0];
void *hdr;
+ int i;
pernet = net_generic(net, pm_nl_pernet_id);
spin_lock_bh(&pernet->lock);
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
- if (entry->addr.id <= id)
- continue;
-
- hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, &mptcp_genl_family,
- NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
- if (!hdr)
- break;
+ for (i = id; i < MAX_ADDR_ID + 1; i++) {
+ if (test_bit(i, pernet->id_bitmap)) {
+ entry = __lookup_addr_by_id(pernet, i);
+ if (!entry)
+ break;
+
+ if (entry->addr.id <= id)
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
- if (mptcp_nl_fill_addr(msg, entry) < 0) {
- genlmsg_cancel(msg, hdr);
- break;
+ id = entry->addr.id;
+ genlmsg_end(msg, hdr);
}
-
- id = entry->addr.id;
- genlmsg_end(msg, hdr);
}
spin_unlock_bh(&pernet->lock);
@@ -1096,6 +1387,66 @@ fail:
return -EMSGSIZE;
}
+static int mptcp_nl_addr_backup(struct net *net,
+ struct mptcp_addr_info *addr,
+ u8 bkup)
+{
+ long s_slot = 0, s_num = 0;
+ struct mptcp_sock *msk;
+ int ret = -EINVAL;
+
+ while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
+ struct sock *sk = (struct sock *)msk;
+
+ if (list_empty(&msk->conn_list))
+ goto next;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+ ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, bkup);
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+
+next:
+ sock_put(sk);
+ cond_resched();
+ }
+
+ return ret;
+}
+
+static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct net *net = sock_net(skb->sk);
+ u8 bkup = 0;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ if (addr.addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
+ bkup = 1;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (addresses_equal(&entry->addr, &addr.addr, true)) {
+ ret = mptcp_nl_addr_backup(net, &entry->addr, bkup);
+ if (ret)
+ return ret;
+
+ if (bkup)
+ entry->addr.flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ entry->addr.flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ }
+ }
+
+ return 0;
+}
+
static const struct genl_small_ops mptcp_pm_ops[] = {
{
.cmd = MPTCP_PM_CMD_ADD_ADDR,
@@ -1126,6 +1477,11 @@ static const struct genl_small_ops mptcp_pm_ops[] = {
.cmd = MPTCP_PM_CMD_GET_LIMITS,
.doit = mptcp_nl_cmd_get_limits,
},
+ {
+ .cmd = MPTCP_PM_CMD_SET_FLAGS,
+ .doit = mptcp_nl_cmd_set_flags,
+ .flags = GENL_ADMIN_PERM,
+ },
};
static struct genl_family mptcp_genl_family __ro_after_init = {
@@ -1148,6 +1504,7 @@ static int __net_init pm_nl_init_net(struct net *net)
INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
__reset_counters(pernet);
pernet->next_id = 1;
+ bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1);
spin_lock_init(&pernet->lock);
return 0;
}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f998a077c7dd..b9f16a1535d2 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -45,11 +45,14 @@ static struct percpu_counter mptcp_sockets_allocated;
static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);
+DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+static struct net_device mptcp_napi_dev;
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
*/
-static struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
+struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk)
{
if (!msk->subflow || READ_ONCE(msk->can_ack))
return NULL;
@@ -114,11 +117,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk)
list_add(&subflow->node, &msk->conn_list);
sock_hold(ssock->sk);
subflow->request_mptcp = 1;
-
- /* accept() will wait on first subflow sk_wq, and we always wakes up
- * via msk->sk_socket
- */
- RCU_INIT_POINTER(msk->first->sk_wq, &sk->sk_socket->wq);
+ mptcp_sock_graft(msk->first, sk->sk_socket);
return 0;
}
@@ -734,10 +733,14 @@ wake:
void __mptcp_flush_join_list(struct mptcp_sock *msk)
{
+ struct mptcp_subflow_context *subflow;
+
if (likely(list_empty(&msk->join_list)))
return;
spin_lock_bh(&msk->join_list_lock);
+ list_for_each_entry(subflow, &msk->join_list, node)
+ mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
}
@@ -1037,13 +1040,6 @@ out:
__mptcp_update_wmem(sk);
sk_mem_reclaim_partial(sk);
}
-
- if (sk_stream_is_writeable(sk)) {
- /* pairs with memory barrier in mptcp_poll */
- smp_mb();
- if (test_and_clear_bit(MPTCP_NOSPACE, &msk->flags))
- sk_stream_write_space(sk);
- }
}
if (snd_una == READ_ONCE(msk->snd_nxt)) {
@@ -1362,8 +1358,7 @@ struct subflow_send_info {
u64 ratio;
};
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
- u32 *sndbuf)
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
struct subflow_send_info send_info[2];
struct mptcp_subflow_context *subflow;
@@ -1374,24 +1369,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
sock_owned_by_me((struct sock *)msk);
- *sndbuf = 0;
if (__mptcp_check_fallback(msk)) {
if (!msk->first)
return NULL;
- *sndbuf = msk->first->sk_sndbuf;
return sk_stream_memory_free(msk->first) ? msk->first : NULL;
}
/* re-use last subflow, if the burst allow that */
if (msk->last_snd && msk->snd_burst > 0 &&
sk_stream_memory_free(msk->last_snd) &&
- mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
- mptcp_for_each_subflow(msk, subflow) {
- ssk = mptcp_subflow_tcp_sock(subflow);
- *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
- }
+ mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd)))
return msk->last_snd;
- }
/* pick the subflow with the lower wmem/wspace ratio */
for (i = 0; i < 2; ++i) {
@@ -1404,8 +1392,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
continue;
nr_active += !subflow->backup;
- *sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
- if (!sk_stream_memory_free(subflow->tcp_sock))
+ if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd)
continue;
pace = READ_ONCE(ssk->sk_pacing_rate);
@@ -1431,9 +1418,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
if (send_info[0].ssk) {
msk->last_snd = send_info[0].ssk;
msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
- sk_stream_wspace(msk->last_snd));
+ tcp_sk(msk->last_snd)->snd_wnd);
return msk->last_snd;
}
+
return NULL;
}
@@ -1454,7 +1442,6 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
};
struct mptcp_data_frag *dfrag;
int len, copied = 0;
- u32 sndbuf;
while ((dfrag = mptcp_send_head(sk))) {
info.sent = dfrag->already_sent;
@@ -1465,12 +1452,7 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
prev_ssk = ssk;
__mptcp_flush_join_list(msk);
- ssk = mptcp_subflow_get_send(msk, &sndbuf);
-
- /* do auto tuning */
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- sndbuf > READ_ONCE(sk->sk_sndbuf))
- WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+ ssk = mptcp_subflow_get_send(msk);
/* try to keep the subflow socket lock across
* consecutive xmit on the same socket
@@ -1527,7 +1509,9 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_sendmsg_info info;
struct mptcp_data_frag *dfrag;
+ struct sock *xmit_ssk;
int len, copied = 0;
+ bool first = true;
info.flags = 0;
while ((dfrag = mptcp_send_head(sk))) {
@@ -1537,10 +1521,17 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
while (len > 0) {
int ret = 0;
- /* do auto tuning */
- if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
- WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+ /* the caller already invoked the packet scheduler,
+ * check for a different subflow usage only after
+ * spooling the first chunk of data
+ */
+ xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk));
+ if (!xmit_ssk)
+ goto out;
+ if (xmit_ssk != ssk) {
+ mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
+ goto out;
+ }
if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
__mptcp_update_wmem(sk);
@@ -1560,6 +1551,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
msk->tx_pending_data -= ret;
copied += ret;
len -= ret;
+ first = false;
}
WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
}
@@ -1579,6 +1571,15 @@ out:
}
}
+static void mptcp_set_nospace(struct sock *sk)
+{
+ /* enable autotune */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+ /* will be cleared on avail space */
+ set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags);
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1680,7 +1681,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
continue;
wait_for_memory:
- set_bit(MPTCP_NOSPACE, &msk->flags);
+ mptcp_set_nospace(sk);
mptcp_push_pending(sk, msg->msg_flags);
ret = sk_stream_wait_memory(sk, &timeo);
if (ret)
@@ -2116,9 +2117,6 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow)
{
- bool dispose_socket = false;
- struct socket *sock;
-
list_del(&subflow->node);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
@@ -2126,11 +2124,8 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
/* if we are invoked by the msk cleanup code, the subflow is
* already orphaned
*/
- sock = ssk->sk_socket;
- if (sock) {
- dispose_socket = sock != sk->sk_socket;
+ if (ssk->sk_socket)
sock_orphan(ssk);
- }
subflow->disposable = 1;
@@ -2148,8 +2143,6 @@ void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
__sock_put(ssk);
}
release_sock(ssk);
- if (dispose_socket)
- iput(SOCK_INODE(sock));
sock_put(ssk);
}
@@ -2194,6 +2187,8 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *tmp;
+ might_sleep();
+
list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -2536,6 +2531,14 @@ static void __mptcp_destroy_sock(struct sock *sk)
pr_debug("msk=%p", msk);
+ might_sleep();
+
+ /* dispose the ancillatory tcp socket, if any */
+ if (msk->subflow) {
+ iput(SOCK_INODE(msk->subflow));
+ msk->subflow = NULL;
+ }
+
/* be sure to always acquire the join list lock, to sync vs
* mptcp_finish_join().
*/
@@ -2586,20 +2589,10 @@ cleanup:
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- bool slow, dispose_socket;
- struct socket *sock;
+ bool slow = lock_sock_fast(ssk);
- slow = lock_sock_fast(ssk);
- sock = ssk->sk_socket;
- dispose_socket = sock && sock != sk->sk_socket;
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);
-
- /* for the outgoing subflows we additionally need to free
- * the associated socket
- */
- if (dispose_socket)
- iput(SOCK_INODE(sock));
}
sock_orphan(sk);
@@ -2928,10 +2921,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
if (!mptcp_send_head(sk))
return;
- if (!sock_owned_by_user(sk))
- __mptcp_subflow_push_pending(sk, ssk);
- else
+ if (!sock_owned_by_user(sk)) {
+ struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk));
+
+ if (xmit_ssk == ssk)
+ __mptcp_subflow_push_pending(sk, ssk);
+ else if (xmit_ssk)
+ mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk));
+ } else {
set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ }
}
#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
@@ -2979,6 +2978,20 @@ static void mptcp_release_cb(struct sock *sk)
}
}
+void mptcp_subflow_process_delegated(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ struct sock *sk = subflow->conn;
+
+ mptcp_data_lock(sk);
+ if (!sock_owned_by_user(sk))
+ __mptcp_subflow_push_pending(sk, ssk);
+ else
+ set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
+ mptcp_data_unlock(sk);
+ mptcp_subflow_delegated_done(subflow);
+}
+
static int mptcp_hash(struct sock *sk)
{
/* should never be called,
@@ -3041,7 +3054,7 @@ void mptcp_finish_connect(struct sock *ssk)
mptcp_rcv_space_init(msk, ssk);
}
-static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
+void mptcp_sock_graft(struct sock *sk, struct socket *parent)
{
write_lock_bh(&sk->sk_callback_lock);
rcu_assign_pointer(sk->sk_wq, &parent->wq);
@@ -3284,6 +3297,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
mptcp_copy_inaddrs(newsk, msk->first);
mptcp_rcv_space_init(msk, msk->first);
+ mptcp_propagate_sndbuf(newsk, msk->first);
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
@@ -3324,7 +3338,7 @@ static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
- set_bit(MPTCP_NOSPACE, &msk->flags);
+ mptcp_set_nospace(sk);
smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
if (sk_stream_is_writeable(sk))
return EPOLLOUT | EPOLLWRNORM;
@@ -3388,13 +3402,58 @@ static struct inet_protosw mptcp_protosw = {
.flags = INET_PROTOSW_ICSK,
};
+static int mptcp_napi_poll(struct napi_struct *napi, int budget)
+{
+ struct mptcp_delegated_action *delegated;
+ struct mptcp_subflow_context *subflow;
+ int work_done = 0;
+
+ delegated = container_of(napi, struct mptcp_delegated_action, napi);
+ while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ bh_lock_sock_nested(ssk);
+ if (!sock_owned_by_user(ssk) &&
+ mptcp_subflow_has_delegated_action(subflow))
+ mptcp_subflow_process_delegated(ssk);
+ /* ... elsewhere tcp_release_cb_override already processed
+ * the action or will do at next release_sock().
+ * In both case must dequeue the subflow here - on the same
+ * CPU that scheduled it.
+ */
+ bh_unlock_sock(ssk);
+ sock_put(ssk);
+
+ if (++work_done == budget)
+ return budget;
+ }
+
+ /* always provide a 0 'work_done' argument, so that napi_complete_done
+ * will not try accessing the NULL napi->dev ptr
+ */
+ napi_complete_done(napi, 0);
+ return work_done;
+}
+
void __init mptcp_proto_init(void)
{
+ struct mptcp_delegated_action *delegated;
+ int cpu;
+
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");
+ init_dummy_netdev(&mptcp_napi_dev);
+ for_each_possible_cpu(cpu) {
+ delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
+ INIT_LIST_HEAD(&delegated->head);
+ netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll,
+ NAPI_POLL_WEIGHT);
+ napi_enable(&delegated->napi);
+ }
+
mptcp_subflow_init();
mptcp_pm_init();
mptcp_token_init();
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index d67de793d363..73a923d02aad 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -24,6 +24,7 @@
#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
#define OPTION_MPTCP_RM_ADDR BIT(8)
#define OPTION_MPTCP_FASTCLOSE BIT(9)
+#define OPTION_MPTCP_PRIO BIT(10)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -59,6 +60,8 @@
#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 24
#define TCPOLEN_MPTCP_PORT_LEN 4
#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+#define TCPOLEN_MPTCP_PRIO 3
+#define TCPOLEN_MPTCP_PRIO_ALIGN 4
#define TCPOLEN_MPTCP_FASTCLOSE 12
/* MPTCP MP_JOIN flags */
@@ -86,6 +89,9 @@
#define MPTCP_ADDR_IPVERSION_4 4
#define MPTCP_ADDR_IPVERSION_6 6
+/* MPTCP MP_PRIO flags */
+#define MPTCP_PRIO_BKUP BIT(0)
+
/* MPTCP socket flags */
#define MPTCP_DATA_READY 0
#define MPTCP_NOSPACE 1
@@ -116,6 +122,7 @@ struct mptcp_options_received {
dss : 1,
add_addr : 1,
rm_addr : 1,
+ mp_prio : 1,
family : 4,
echo : 1,
backup : 1;
@@ -196,10 +203,6 @@ struct mptcp_pm_data {
u8 add_addr_accepted;
u8 local_addr_used;
u8 subflows;
- u8 add_addr_signal_max;
- u8 add_addr_accept_max;
- u8 local_addr_max;
- u8 subflows_max;
u8 status;
u8 rm_id;
};
@@ -285,6 +288,11 @@ struct mptcp_sock {
#define mptcp_for_each_subflow(__msk, __subflow) \
list_for_each_entry(__subflow, &((__msk)->conn_list), node)
+static inline void msk_owned_by_me(const struct mptcp_sock *msk)
+{
+ sock_owned_by_me((const struct sock *)msk);
+}
+
static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
{
return (struct mptcp_sock *)sk;
@@ -372,6 +380,15 @@ enum mptcp_data_avail {
MPTCP_SUBFLOW_OOO_DATA
};
+struct mptcp_delegated_action {
+ struct napi_struct napi;
+ struct list_head head;
+};
+
+DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+
+#define MPTCP_DELEGATE_SEND 0
+
/* MPTCP subflow context */
struct mptcp_subflow_context {
struct list_head node;/* conn_list of subflows */
@@ -396,6 +413,7 @@ struct mptcp_subflow_context {
map_valid : 1,
mpc_map : 1,
backup : 1,
+ send_mp_prio : 1,
rx_eof : 1,
can_ack : 1, /* only after processing the remote a key */
disposable : 1; /* ctx can be free at ulp release time */
@@ -408,6 +426,9 @@ struct mptcp_subflow_context {
u8 local_id;
u8 remote_id;
+ long delegated_status;
+ struct list_head delegated_node; /* link into delegated_action, protected by local BH */
+
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
const struct inet_connection_sock_af_ops *icsk_af_ops;
@@ -456,6 +477,61 @@ static inline void mptcp_add_pending_subflow(struct mptcp_sock *msk,
spin_unlock_bh(&msk->join_list_lock);
}
+void mptcp_subflow_process_delegated(struct sock *ssk);
+
+static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_delegated_action *delegated;
+ bool schedule;
+
+ /* The implied barrier pairs with mptcp_subflow_delegated_done(), and
+ * ensures the below list check sees list updates done prior to status
+ * bit changes
+ */
+ if (!test_and_set_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) {
+ /* still on delegated list from previous scheduling */
+ if (!list_empty(&subflow->delegated_node))
+ return;
+
+ /* the caller held the subflow bh socket lock */
+ lockdep_assert_in_softirq();
+
+ delegated = this_cpu_ptr(&mptcp_delegated_actions);
+ schedule = list_empty(&delegated->head);
+ list_add_tail(&subflow->delegated_node, &delegated->head);
+ sock_hold(mptcp_subflow_tcp_sock(subflow));
+ if (schedule)
+ napi_schedule(&delegated->napi);
+ }
+}
+
+static inline struct mptcp_subflow_context *
+mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated)
+{
+ struct mptcp_subflow_context *ret;
+
+ if (list_empty(&delegated->head))
+ return NULL;
+
+ ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node);
+ list_del_init(&ret->delegated_node);
+ return ret;
+}
+
+static inline bool mptcp_subflow_has_delegated_action(const struct mptcp_subflow_context *subflow)
+{
+ return test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status);
+}
+
+static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *subflow)
+{
+ /* pairs with mptcp_subflow_delegate, ensures delegate_node is updated before
+ * touching the status bit
+ */
+ smp_wmb();
+ clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status);
+}
+
int mptcp_is_enabled(struct net *net);
unsigned int mptcp_get_add_addr_timeout(struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
@@ -466,11 +542,16 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how);
void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
struct mptcp_subflow_context *subflow);
void mptcp_subflow_reset(struct sock *ssk);
+void mptcp_sock_graft(struct sock *sk, struct socket *parent);
+struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk);
/* called with sk socket lock held */
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
+void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr,
+ unsigned short family);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
struct mptcp_subflow_context *ctx)
@@ -514,6 +595,25 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
}
+static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+ if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
+ return false;
+
+ WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
+ return true;
+}
+
+static inline void mptcp_write_space(struct sock *sk)
+{
+ if (sk_stream_is_writeable(sk)) {
+ /* pairs with memory barrier in mptcp_poll */
+ smp_mb();
+ if (test_and_clear_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags))
+ sk_stream_write_space(sk);
+ }
+}
+
void mptcp_destroy_common(struct mptcp_sock *msk);
void __init mptcp_token_init(void);
@@ -550,7 +650,12 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id);
+void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup);
+int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
+ struct mptcp_addr_info *addr,
+ u8 bkup);
void mptcp_pm_free_anno_list(struct mptcp_sock *msk);
+bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk);
struct mptcp_pm_add_entry *
mptcp_pm_del_add_timer(struct mptcp_sock *msk,
struct mptcp_addr_info *addr);
@@ -615,6 +720,9 @@ void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk);
void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id);
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk);
+unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
{
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 278cbe3e539e..280da418d60b 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -18,12 +18,15 @@
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
+#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"
+static void mptcp_subflow_ops_undo_override(struct sock *ssk);
+
static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
enum linux_mptcp_mib_field field)
{
@@ -61,11 +64,23 @@ static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
}
/* validate received token and create truncated hmac and nonce for SYN-ACK */
-static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
- const struct sk_buff *skb)
+static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
- struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ struct mptcp_sock *msk = subflow_req->msk;
u8 hmac[SHA256_DIGEST_SIZE];
+
+ get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
+
+ subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_req->local_nonce,
+ subflow_req->remote_nonce, hmac);
+
+ subflow_req->thmac = get_unaligned_be64(hmac);
+}
+
+static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
struct mptcp_sock *msk;
int local_id;
@@ -82,13 +97,6 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req,
}
subflow_req->local_id = local_id;
- get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
-
- subflow_generate_hmac(msk->local_key, msk->remote_key,
- subflow_req->local_nonce,
- subflow_req->remote_nonce, hmac);
-
- subflow_req->thmac = get_unaligned_be64(hmac);
return msk;
}
@@ -112,6 +120,11 @@ static int __subflow_init_req(struct request_sock *req, const struct sock *sk_li
return 0;
}
+static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
+{
+ return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
+}
+
/* Init mptcp request socket.
*
* Returns an error code if a JOIN has failed and a TCP reset
@@ -178,12 +191,30 @@ again:
subflow_req->remote_id = mp_opt.join_id;
subflow_req->token = mp_opt.token;
subflow_req->remote_nonce = mp_opt.nonce;
- subflow_req->msk = subflow_token_join_request(req, skb);
+ subflow_req->msk = subflow_token_join_request(req);
/* Can't fall back to TCP in this case. */
if (!subflow_req->msk)
return -EPERM;
+ if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
+ pr_debug("syn inet_sport=%d %d",
+ ntohs(inet_sk(sk_listener)->inet_sport),
+ ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
+ if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
+ sock_put((struct sock *)subflow_req->msk);
+ mptcp_token_destroy_request(req);
+ tcp_request_sock_ops.destructor(req);
+ subflow_req->msk = NULL;
+ subflow_req->mp_join = 0;
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
+ return -EPERM;
+ }
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
+ }
+
+ subflow_req_create_thmac(subflow_req);
+
if (unlikely(req->syncookie)) {
if (mptcp_can_accept_new_subflow(subflow_req->msk))
subflow_init_req_cookie_join_save(subflow_req, skb);
@@ -326,6 +357,11 @@ void mptcp_subflow_reset(struct sock *ssk)
sock_put(sk);
}
+static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
+{
+ return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
+}
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -343,6 +379,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
if (subflow->conn_finished)
return;
+ mptcp_propagate_sndbuf(parent, sk);
subflow->rel_write_seq = 1;
subflow->conn_finished = 1;
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -391,6 +428,13 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
subflow->mp_join = 1;
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+
+ if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
+ pr_debug("synack inet_dport=%d %d",
+ ntohs(inet_sk(sk)->inet_dport),
+ ntohs(inet_sk(parent)->inet_dport));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
+ }
} else if (mptcp_check_fallback(sk)) {
fallback:
mptcp_rcv_space_init(mptcp_sk(parent), sk);
@@ -427,6 +471,7 @@ drop:
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
static struct inet_connection_sock_af_ops subflow_v6_specific;
static struct inet_connection_sock_af_ops subflow_v6m_specific;
+static struct proto tcpv6_prot_override;
static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
@@ -508,6 +553,8 @@ static void subflow_ulp_fallback(struct sock *sk,
icsk->icsk_ulp_ops = NULL;
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
tcp_sk(sk)->is_mptcp = 0;
+
+ mptcp_subflow_ops_undo_override(sk);
}
static void subflow_drop_ctx(struct sock *ssk)
@@ -653,6 +700,17 @@ create_child:
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
tcp_rsk(req)->drop_req = true;
+
+ if (subflow_use_different_sport(owner, sk)) {
+ pr_debug("ack inet_sport=%d %d",
+ ntohs(inet_sk(sk)->inet_sport),
+ ntohs(inet_sk((struct sock *)owner)->inet_sport));
+ if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
+ goto out;
+ }
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
+ }
}
}
@@ -681,6 +739,7 @@ dispose_child:
}
static struct inet_connection_sock_af_ops subflow_specific;
+static struct proto tcp_prot_override;
enum mapping_status {
MAPPING_OK,
@@ -1040,7 +1099,10 @@ static void subflow_data_ready(struct sock *sk)
static void subflow_write_space(struct sock *ssk)
{
- /* we take action in __mptcp_clean_una() */
+ struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
+
+ mptcp_propagate_sndbuf(sk, ssk);
+ mptcp_write_space(sk);
}
static struct inet_connection_sock_af_ops *
@@ -1073,22 +1135,32 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
}
#endif
-static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
- struct sockaddr_storage *addr)
+void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr,
+ unsigned short family)
{
memset(addr, 0, sizeof(*addr));
- addr->ss_family = info->family;
+ addr->ss_family = family;
if (addr->ss_family == AF_INET) {
struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
- in_addr->sin_addr = info->addr;
+ if (info->family == AF_INET)
+ in_addr->sin_addr = info->addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (ipv6_addr_v4mapped(&info->addr6))
+ in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
+#endif
in_addr->sin_port = info->port;
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
else if (addr->ss_family == AF_INET6) {
struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
- in6_addr->sin6_addr = info->addr6;
+ if (info->family == AF_INET)
+ ipv6_addr_set_v4mapped(info->addr.s_addr,
+ &in6_addr->sin6_addr);
+ else
+ in6_addr->sin6_addr = info->addr6;
in6_addr->sin6_port = info->port;
}
#endif
@@ -1132,11 +1204,11 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
subflow->remote_key = msk->remote_key;
subflow->local_key = msk->local_key;
subflow->token = msk->token;
- mptcp_info2sockaddr(loc, &addr);
+ mptcp_info2sockaddr(loc, &addr, ssk->sk_family);
addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (loc->family == AF_INET6)
+ if (addr.ss_family == AF_INET6)
addrlen = sizeof(struct sockaddr_in6);
#endif
ssk->sk_bound_dev_if = loc->ifindex;
@@ -1152,13 +1224,16 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
subflow->remote_id = remote_id;
subflow->request_join = 1;
subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
- mptcp_info2sockaddr(remote, &addr);
+ mptcp_info2sockaddr(remote, &addr, ssk->sk_family);
mptcp_add_pending_subflow(msk, subflow);
err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
if (err && err != -EINPROGRESS)
goto failed_unlink;
+ /* discard the subflow socket */
+ mptcp_sock_graft(ssk, sk->sk_socket);
+ iput(SOCK_INODE(sf));
return err;
failed_unlink:
@@ -1196,6 +1271,25 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
+static void mptcp_subflow_ops_override(struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (ssk->sk_prot == &tcpv6_prot)
+ ssk->sk_prot = &tcpv6_prot_override;
+ else
+#endif
+ ssk->sk_prot = &tcp_prot_override;
+}
+
+static void mptcp_subflow_ops_undo_override(struct sock *ssk)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (ssk->sk_prot == &tcpv6_prot_override)
+ ssk->sk_prot = &tcpv6_prot;
+ else
+#endif
+ ssk->sk_prot = &tcp_prot;
+}
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -1251,6 +1345,7 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
*new_sock = sf;
sock_hold(sk);
subflow->conn = sk;
+ mptcp_subflow_ops_override(sf->sk);
return 0;
}
@@ -1267,6 +1362,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
INIT_LIST_HEAD(&ctx->node);
+ INIT_LIST_HEAD(&ctx->delegated_node);
pr_debug("subflow=%p", ctx);
@@ -1299,6 +1395,7 @@ static void subflow_state_change(struct sock *sk)
__subflow_state_change(sk);
if (subflow_simultaneous_connect(sk)) {
+ mptcp_propagate_sndbuf(parent, sk);
mptcp_do_fallback(sk);
mptcp_rcv_space_init(mptcp_sk(parent), sk);
pr_fallback(mptcp_sk(parent));
@@ -1378,6 +1475,7 @@ static void subflow_ulp_release(struct sock *ssk)
sock_put(sk);
}
+ mptcp_subflow_ops_undo_override(ssk);
if (release)
kfree_rcu(ctx, rcu);
}
@@ -1431,6 +1529,16 @@ static void subflow_ulp_clone(const struct request_sock *req,
}
}
+static void tcp_release_cb_override(struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+ if (mptcp_subflow_has_delegated_action(subflow))
+ mptcp_subflow_process_delegated(ssk);
+
+ tcp_release_cb(ssk);
+}
+
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
.name = "mptcp",
.owner = THIS_MODULE,
@@ -1471,6 +1579,9 @@ void __init mptcp_subflow_init(void)
subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
subflow_specific.sk_rx_dst_set = subflow_finish_connect;
+ tcp_prot_override = tcp_prot;
+ tcp_prot_override.release_cb = tcp_release_cb_override;
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;
@@ -1486,6 +1597,9 @@ void __init mptcp_subflow_init(void)
subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
subflow_v6m_specific.net_frag_header_len = 0;
+
+ tcpv6_prot_override = tcpv6_prot;
+ tcpv6_prot_override.release_cb = tcp_release_cb_override;
#endif
mptcp_diag_subflow_init(&subflow_ulp_ops);