summaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/9p/trans_xen.c8
-rw-r--r--net/bridge/br_netlink.c7
-rw-r--r--net/bridge/br_stp_if.c1
-rw-r--r--net/bridge/br_stp_timer.c2
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c3
-rw-r--r--net/bridge/netfilter/ebtables.c9
-rw-r--r--net/ceph/ceph_common.c29
-rw-r--r--net/ceph/cls_lock_client.c51
-rw-r--r--net/ceph/debugfs.c7
-rw-r--r--net/ceph/messenger.c6
-rw-r--r--net/ceph/osd_client.c143
-rw-r--r--net/ceph/pagelist.c2
-rw-r--r--net/ceph/snapshot.c6
-rw-r--r--net/core/dev.c88
-rw-r--r--net/core/neighbour.c14
-rw-r--r--net/core/rtnetlink.c88
-rw-r--r--net/core/sock.c30
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/ipv6.c8
-rw-r--r--net/decnet/af_decnet.c3
-rw-r--r--net/decnet/dn_neigh.c12
-rw-r--r--net/ipv4/arp.c48
-rw-r--r--net/ipv4/esp4.c5
-rw-r--r--net/ipv4/fib_frontend.c15
-rw-r--r--net/ipv4/fib_trie.c26
-rw-r--r--net/ipv4/inet_connection_sock.c2
-rw-r--r--net/ipv4/inet_hashtables.c6
-rw-r--r--net/ipv4/ipmr.c18
-rw-r--r--net/ipv4/route.c18
-rw-r--r--net/ipv4/tcp.c4
-rw-r--r--net/ipv4/tcp_input.c11
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_metrics.c5
-rw-r--r--net/ipv4/udp.c4
-rw-r--r--net/ipv4/udp_impl.h1
-rw-r--r--net/ipv6/addrconf.c6
-rw-r--r--net/ipv6/ila/ila_xlat.c8
-rw-r--r--net/ipv6/ip6_offload.c7
-rw-r--r--net/ipv6/ip6_output.c20
-rw-r--r--net/ipv6/output_core.c14
-rw-r--r--net/ipv6/route.c13
-rw-r--r--net/ipv6/tcp_ipv6.c4
-rw-r--r--net/ipv6/udp.c4
-rw-r--r--net/ipv6/udp_impl.h1
-rw-r--r--net/ipv6/udp_offload.c6
-rw-r--r--net/key/af_key.c2
-rw-r--r--net/llc/af_llc.c2
-rw-r--r--net/llc/llc_conn.c4
-rw-r--r--net/llc/llc_sap.c2
-rw-r--r--net/mpls/af_mpls.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c19
-rw-r--r--net/netfilter/nf_conntrack_core.c8
-rw-r--r--net/netfilter/nf_conntrack_helper.c12
-rw-r--r--net/netfilter/nf_conntrack_netlink.c11
-rw-r--r--net/netfilter/nf_nat_core.c4
-rw-r--r--net/netfilter/nf_tables_api.c160
-rw-r--r--net/netfilter/nfnetlink_cthelper.c17
-rw-r--r--net/netfilter/nft_bitwise.c19
-rw-r--r--net/netfilter/nft_cmp.c12
-rw-r--r--net/netfilter/nft_ct.c4
-rw-r--r--net/netfilter/nft_immediate.c5
-rw-r--r--net/netfilter/nft_range.c4
-rw-r--r--net/netfilter/nft_set_hash.c2
-rw-r--r--net/netfilter/x_tables.c48
-rw-r--r--net/netfilter/xt_CT.c6
-rw-r--r--net/netfilter/xt_recent.c5
-rw-r--r--net/openvswitch/conntrack.c4
-rw-r--r--net/packet/af_packet.c14
-rw-r--r--net/sched/cls_matchall.c1
-rw-r--r--net/sched/sch_api.c6
-rw-r--r--net/sched/sch_choke.c5
-rw-r--r--net/sched/sch_fq.c12
-rw-r--r--net/sched/sch_fq_codel.c26
-rw-r--r--net/sched/sch_hhf.c33
-rw-r--r--net/sched/sch_netem.c6
-rw-r--r--net/sched/sch_sfq.c6
-rw-r--r--net/sctp/associola.c4
-rw-r--r--net/sctp/ipv6.c49
-rw-r--r--net/sctp/sm_make_chunk.c13
-rw-r--r--net/sctp/sm_statefuns.c3
-rw-r--r--net/smc/Kconfig4
-rw-r--r--net/smc/af_smc.c2
-rw-r--r--net/smc/smc_clc.c4
-rw-r--r--net/smc/smc_core.c16
-rw-r--r--net/smc/smc_core.h2
-rw-r--r--net/smc/smc_ib.c21
-rw-r--r--net/smc/smc_ib.h2
-rw-r--r--net/sunrpc/Kconfig1
-rw-r--r--net/sunrpc/clnt.c8
-rw-r--r--net/sunrpc/sched.c5
-rw-r--r--net/sunrpc/svc.c134
-rw-r--r--net/sunrpc/xdr.c2
-rw-r--r--net/sunrpc/xprt.c1
-rw-r--r--net/sunrpc/xprtrdma/Makefile2
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c12
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c8
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c71
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c89
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c79
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c512
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c978
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c110
-rw-r--r--net/sunrpc/xprtrdma/transport.c57
-rw-r--r--net/sunrpc/xprtrdma/verbs.c323
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h22
-rw-r--r--net/sysctl_net.c1
-rw-r--r--net/tipc/socket.c38
-rw-r--r--net/vmw_vsock/af_vsock.c21
-rw-r--r--net/vmw_vsock/virtio_transport.c6
-rw-r--r--net/x25/af_x25.c24
-rw-r--r--net/x25/sysctl_net_x25.c5
-rw-r--r--net/xfrm/xfrm_device.c2
-rw-r--r--net/xfrm/xfrm_policy.c47
-rw-r--r--net/xfrm/xfrm_state.c2
114 files changed, 2400 insertions, 1511 deletions
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 71e85643b3f9..6ad3e043c617 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -454,8 +454,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev,
goto error_xenbus;
}
priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL);
- if (!priv->tag) {
- ret = -EINVAL;
+ if (IS_ERR(priv->tag)) {
+ ret = PTR_ERR(priv->tag);
goto error_xenbus;
}
ret = xenbus_transaction_end(xbt, 0);
@@ -525,7 +525,7 @@ static struct xenbus_driver xen_9pfs_front_driver = {
.otherend_changed = xen_9pfs_front_changed,
};
-int p9_trans_xen_init(void)
+static int p9_trans_xen_init(void)
{
if (!xen_domain())
return -ENODEV;
@@ -537,7 +537,7 @@ int p9_trans_xen_init(void)
}
module_init(p9_trans_xen_init);
-void p9_trans_xen_exit(void)
+static void p9_trans_xen_exit(void)
{
v9fs_unregister_trans(&p9_xen_trans);
return xenbus_unregister_driver(&xen_9pfs_front_driver);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index c5ce7745b230..574f78824d8a 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -835,6 +835,13 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[])
return -EPROTONOSUPPORT;
}
}
+
+ if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
+ __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
+
+ if (defpvid >= VLAN_VID_MASK)
+ return -EINVAL;
+ }
#endif
return 0;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 08341d2aa9c9..0db8102995a5 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -179,6 +179,7 @@ static void br_stp_start(struct net_bridge *br)
br_debug(br, "using kernel STP\n");
/* To start timers on any ports left in blocking */
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
br_port_state_selection(br);
}
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index c98b3e5c140a..60b6fe277a8b 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg)
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
- if (br->stp_enabled != BR_USER_STP)
+ if (br->stp_enabled == BR_KERNEL_STP)
mod_timer(&br->hello_timer,
round_jiffies(jiffies + br->hello_time));
}
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index 5929309beaa1..db85230e49c3 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -68,6 +68,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par)
if (e->ethproto != htons(ETH_P_ARP) ||
e->invflags & EBT_IPROTO)
return -EINVAL;
+ if (ebt_invalid_target(info->target))
+ return -EINVAL;
+
return 0;
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9ec0c9f908fa..9c6e619f452b 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name,
strlcpy(name, _name, sizeof(name));
if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
- xt_data_to_user(um + entrysize, data, usersize, datasize))
+ xt_data_to_user(um + entrysize, data, usersize, datasize,
+ XT_ALIGN(datasize)))
return -EFAULT;
return 0;
@@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
if (match->compat_to_user(cm->data, m->data))
return -EFAULT;
} else {
- if (xt_data_to_user(cm->data, m->data, match->usersize, msize))
+ if (xt_data_to_user(cm->data, m->data, match->usersize, msize,
+ COMPAT_XT_ALIGN(msize)))
return -EFAULT;
}
@@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target *t,
if (target->compat_to_user(cm->data, t->data))
return -EFAULT;
} else {
- if (xt_data_to_user(cm->data, t->data, target->usersize, tsize))
+ if (xt_data_to_user(cm->data, t->data, target->usersize, tsize,
+ COMPAT_XT_ALIGN(tsize)))
return -EFAULT;
}
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 108533859a53..4fd02831beed 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -45,6 +45,17 @@ bool libceph_compatible(void *data)
}
EXPORT_SYMBOL(libceph_compatible);
+static int param_get_supported_features(char *buffer,
+ const struct kernel_param *kp)
+{
+ return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT);
+}
+static const struct kernel_param_ops param_ops_supported_features = {
+ .get = param_get_supported_features,
+};
+module_param_cb(supported_features, &param_ops_supported_features, NULL,
+ S_IRUGO);
+
/*
* find filename portion of a path (/foo/bar/baz -> baz)
*/
@@ -187,7 +198,7 @@ void *ceph_kvmalloc(size_t size, gfp_t flags)
return ptr;
}
- return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, flags, PAGE_KERNEL);
}
@@ -596,9 +607,7 @@ EXPORT_SYMBOL(ceph_client_gid);
/*
* create a fresh client instance
*/
-struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
- u64 supported_features,
- u64 required_features)
+struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
{
struct ceph_client *client;
struct ceph_entity_addr *myaddr = NULL;
@@ -615,14 +624,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
init_waitqueue_head(&client->auth_wq);
client->auth_err = 0;
- if (!ceph_test_opt(client, NOMSGAUTH))
- required_features |= CEPH_FEATURE_MSG_AUTH;
-
client->extra_mon_dispatch = NULL;
- client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
- supported_features;
- client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
- required_features;
+ client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
+ client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT;
+
+ if (!ceph_test_opt(client, NOMSGAUTH))
+ client->required_features |= CEPH_FEATURE_MSG_AUTH;
/* msgr */
if (ceph_test_opt(client, MYIP))
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index b9233b990399..08ada893f01e 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_cls_break_lock);
+int ceph_cls_set_cookie(struct ceph_osd_client *osdc,
+ struct ceph_object_id *oid,
+ struct ceph_object_locator *oloc,
+ char *lock_name, u8 type, char *old_cookie,
+ char *tag, char *new_cookie)
+{
+ int cookie_op_buf_size;
+ int name_len = strlen(lock_name);
+ int old_cookie_len = strlen(old_cookie);
+ int tag_len = strlen(tag);
+ int new_cookie_len = strlen(new_cookie);
+ void *p, *end;
+ struct page *cookie_op_page;
+ int ret;
+
+ cookie_op_buf_size = name_len + sizeof(__le32) +
+ old_cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ new_cookie_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+ if (cookie_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ cookie_op_page = alloc_page(GFP_NOIO);
+ if (!cookie_op_page)
+ return -ENOMEM;
+
+ p = page_address(cookie_op_page);
+ end = p + cookie_op_buf_size;
+
+ /* encode cls_lock_set_cookie_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, old_cookie, old_cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ ceph_encode_string(&p, end, new_cookie, new_cookie_len);
+
+ dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n",
+ __func__, lock_name, type, old_cookie, tag, new_cookie);
+ ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie",
+ CEPH_OSD_FLAG_WRITE, cookie_op_page,
+ cookie_op_buf_size, NULL, NULL);
+
+ dout("%s: status %d\n", __func__, ret);
+ __free_page(cookie_op_page);
+ return ret;
+}
+EXPORT_SYMBOL(ceph_cls_set_cookie);
+
void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers)
{
int i;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index c62b2b029a6e..71ba13927b3d 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p)
return 0;
down_read(&osdc->lock);
- seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
+ seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
+ osdc->epoch_barrier, map->flags);
for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pi =
@@ -177,9 +178,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
seq_printf(s, "%llu\t", req->r_tid);
dump_target(s, &req->r_t);
- seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
- le32_to_cpu(req->r_replay_version.epoch),
- le64_to_cpu(req->r_replay_version.version));
+ seq_printf(s, "\t%d", req->r_attempts);
for (i = 0; i < req->r_num_ops; i++) {
struct ceph_osd_req_op *op = &req->r_ops[i];
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f76bb3332613..5766a6c896c4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1386,8 +1386,9 @@ static void prepare_write_keepalive(struct ceph_connection *con)
dout("prepare_write_keepalive %p\n", con);
con_out_kvec_reset(con);
if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
- struct timespec now = CURRENT_TIME;
+ struct timespec now;
+ ktime_get_real_ts(&now);
con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
ceph_encode_timespec(&con->out_temp_keepalive2, &now);
con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
@@ -3176,8 +3177,9 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con,
{
if (interval > 0 &&
(con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
- struct timespec now = CURRENT_TIME;
+ struct timespec now;
struct timespec ts;
+ ktime_get_real_ts(&now);
jiffies_to_timespec(interval, &ts);
ts = timespec_add(con->last_keepalive_ack, ts);
return timespec_compare(&now, &ts) >= 0;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index e15ea9e4c495..924f07c36ddb 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
+ req->r_abort_on_full = true;
req->r_flags = flags;
req->r_base_oloc.pool = layout->pool_id;
req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
@@ -1005,7 +1006,7 @@ static bool osd_registered(struct ceph_osd *osd)
*/
static void osd_init(struct ceph_osd *osd)
{
- atomic_set(&osd->o_ref, 1);
+ refcount_set(&osd->o_ref, 1);
RB_CLEAR_NODE(&osd->o_node);
osd->o_requests = RB_ROOT;
osd->o_linger_requests = RB_ROOT;
@@ -1050,9 +1051,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
static struct ceph_osd *get_osd(struct ceph_osd *osd)
{
- if (atomic_inc_not_zero(&osd->o_ref)) {
- dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
- atomic_read(&osd->o_ref));
+ if (refcount_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1,
+ refcount_read(&osd->o_ref));
return osd;
} else {
dout("get_osd %p FAIL\n", osd);
@@ -1062,9 +1063,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd)
static void put_osd(struct ceph_osd *osd)
{
- dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
- atomic_read(&osd->o_ref) - 1);
- if (atomic_dec_and_test(&osd->o_ref)) {
+ dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref),
+ refcount_read(&osd->o_ref) - 1);
+ if (refcount_dec_and_test(&osd->o_ref)) {
osd_cleanup(osd);
kfree(osd);
}
@@ -1297,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
__pool_full(pi);
WARN_ON(pi->id != t->base_oloc.pool);
- return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
- (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
+ return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
+ ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
+ (osdc->osdmap->epoch < osdc->epoch_barrier);
}
enum calc_target_result {
@@ -1503,9 +1505,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
ceph_encode_32(&p, req->r_flags);
ceph_encode_timespec(p, &req->r_mtime);
p += sizeof(struct ceph_timespec);
- /* aka reassert_version */
- memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
- p += sizeof(req->r_replay_version);
+
+ /* reassert_version */
+ memset(p, 0, sizeof(struct ceph_eversion));
+ p += sizeof(struct ceph_eversion);
/* oloc */
ceph_start_encoding(&p, 5, 4,
@@ -1626,6 +1629,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc)
ceph_monc_renew_subs(&osdc->client->monc);
}
+static void complete_request(struct ceph_osd_request *req, int err);
static void send_map_check(struct ceph_osd_request *req);
static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -1635,6 +1639,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
enum calc_target_result ct_res;
bool need_send = false;
bool promoted = false;
+ bool need_abort = false;
WARN_ON(req->r_tid);
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
@@ -1650,8 +1655,13 @@ again:
goto promote;
}
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
- ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
+ if (osdc->osdmap->epoch < osdc->epoch_barrier) {
+ dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch,
+ osdc->epoch_barrier);
+ req->r_t.paused = true;
+ maybe_request_map(osdc);
+ } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) {
dout("req %p pausewr\n", req);
req->r_t.paused = true;
maybe_request_map(osdc);
@@ -1669,6 +1679,8 @@ again:
pr_warn_ratelimited("FULL or reached pool quota\n");
req->r_t.paused = true;
maybe_request_map(osdc);
+ if (req->r_abort_on_full)
+ need_abort = true;
} else if (!osd_homeless(osd)) {
need_send = true;
} else {
@@ -1685,6 +1697,8 @@ again:
link_request(osd, req);
if (need_send)
send_request(req);
+ else if (need_abort)
+ complete_request(req, -ENOSPC);
mutex_unlock(&osd->lock);
if (ct_res == CALC_TARGET_POOL_DNE)
@@ -1799,6 +1813,97 @@ static void abort_request(struct ceph_osd_request *req, int err)
complete_request(req, err);
}
+static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ if (likely(eb > osdc->epoch_barrier)) {
+ dout("updating epoch_barrier from %u to %u\n",
+ osdc->epoch_barrier, eb);
+ osdc->epoch_barrier = eb;
+ /* Request map if we're not to the barrier yet */
+ if (eb > osdc->osdmap->epoch)
+ maybe_request_map(osdc);
+ }
+}
+
+void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
+{
+ down_read(&osdc->lock);
+ if (unlikely(eb > osdc->epoch_barrier)) {
+ up_read(&osdc->lock);
+ down_write(&osdc->lock);
+ update_epoch_barrier(osdc, eb);
+ up_write(&osdc->lock);
+ } else {
+ up_read(&osdc->lock);
+ }
+}
+EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier);
+
+/*
+ * Drop all pending requests that are stalled waiting on a full condition to
+ * clear, and complete them with ENOSPC as the return code. Set the
+ * osdc->epoch_barrier to the latest map epoch that we've seen if any were
+ * cancelled.
+ */
+static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+ bool victims = false;
+
+ dout("enter abort_on_full\n");
+
+ if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc))
+ goto out;
+
+ /* Scan list and see if there is anything to abort */
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ struct rb_node *m;
+
+ m = rb_first(&osd->o_requests);
+ while (m) {
+ struct ceph_osd_request *req = rb_entry(m,
+ struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_abort_on_full) {
+ victims = true;
+ break;
+ }
+ }
+ if (victims)
+ break;
+ }
+
+ if (!victims)
+ goto out;
+
+ /*
+ * Update the barrier to current epoch if it's behind that point,
+ * since we know we have some calls to be aborted in the tree.
+ */
+ update_epoch_barrier(osdc, osdc->osdmap->epoch);
+
+ for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+ struct rb_node *m;
+
+ m = rb_first(&osd->o_requests);
+ while (m) {
+ struct ceph_osd_request *req = rb_entry(m,
+ struct ceph_osd_request, r_node);
+ m = rb_next(m);
+
+ if (req->r_abort_on_full &&
+ (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
+ pool_full(osdc, req->r_t.target_oloc.pool)))
+ abort_request(req, -ENOSPC);
+ }
+ }
+out:
+ dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier);
+}
+
static void check_pool_dne(struct ceph_osd_request *req)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -3252,11 +3357,13 @@ done:
pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) ||
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
have_pool_full(osdc);
- if (was_pauserd || was_pausewr || pauserd || pausewr)
+ if (was_pauserd || was_pausewr || pauserd || pausewr ||
+ osdc->osdmap->epoch < osdc->epoch_barrier)
maybe_request_map(osdc);
kick_requests(osdc, &need_resend, &need_resend_linger);
+ ceph_osdc_abort_on_full(osdc);
ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
osdc->osdmap->epoch);
up_write(&osdc->lock);
@@ -3574,7 +3681,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
ceph_oid_copy(&lreq->t.base_oid, oid);
ceph_oloc_copy(&lreq->t.base_oloc, oloc);
lreq->t.flags = CEPH_OSD_FLAG_WRITE;
- lreq->mtime = CURRENT_TIME;
+ ktime_get_real_ts(&lreq->mtime);
lreq->reg_req = alloc_linger_request(lreq);
if (!lreq->reg_req) {
@@ -3632,7 +3739,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
req->r_flags = CEPH_OSD_FLAG_WRITE;
- req->r_mtime = CURRENT_TIME;
+ ktime_get_real_ts(&req->r_mtime);
osd_req_op_watch_init(req, 0, lreq->linger_id,
CEPH_OSD_WATCH_OP_UNWATCH);
@@ -4126,7 +4233,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
close_osd(osd);
}
up_write(&osdc->lock);
- WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
+ WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1);
osd_cleanup(&osdc->homeless_osd);
WARN_ON(!list_empty(&osdc->osd_lru));
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 6864007e64fc..ce09f73be759 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
void ceph_pagelist_release(struct ceph_pagelist *pl)
{
- if (!atomic_dec_and_test(&pl->refcnt))
+ if (!refcount_dec_and_test(&pl->refcnt))
return;
ceph_pagelist_unmap_tail(pl);
while (!list_empty(&pl->head)) {
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index 705414e78ae0..e14a5d038656 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
if (!snapc)
return NULL;
- atomic_set(&snapc->nref, 1);
+ refcount_set(&snapc->nref, 1);
snapc->num_snaps = snap_count;
return snapc;
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context);
struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
{
if (sc)
- atomic_inc(&sc->nref);
+ refcount_inc(&sc->nref);
return sc;
}
EXPORT_SYMBOL(ceph_get_snap_context);
@@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc)
{
if (!sc)
return;
- if (atomic_dec_and_test(&sc->nref)) {
+ if (refcount_dec_and_test(&sc->nref)) {
/*printk(" deleting snap_context %p\n", sc);*/
kfree(sc);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index d07aa5ffb511..fca407b4a6ea 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -81,6 +81,7 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
@@ -4235,7 +4236,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
int ret;
if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
/*
* PFMEMALLOC skbs are special, they should
@@ -4246,9 +4247,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
* Use PF_MEMALLOC as this saves us from propagating the allocation
* context down to all allocation sites.
*/
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
ret = __netif_receive_skb_core(skb, true);
- current_restore_flags(pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
} else
ret = __netif_receive_skb_core(skb, false);
@@ -6851,6 +6852,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
+{
+ struct netdev_xdp xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+
+ /* Query must always succeed. */
+ WARN_ON(xdp_op(dev, &xdp) < 0);
+ return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+ struct netlink_ext_ack *extack,
+ struct bpf_prog *prog)
+{
+ struct netdev_xdp xdp;
+
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_SETUP_PROG;
+ xdp.extack = extack;
+ xdp.prog = prog;
+
+ return xdp_op(dev, &xdp);
+}
+
/**
* dev_change_xdp_fd - set or clear a bpf program for a device rx path
* @dev: device
@@ -6863,41 +6890,34 @@ EXPORT_SYMBOL(dev_change_proto_down);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags)
{
- int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
const struct net_device_ops *ops = dev->netdev_ops;
struct bpf_prog *prog = NULL;
- struct netdev_xdp xdp;
+ xdp_op_t xdp_op, xdp_chk;
int err;
ASSERT_RTNL();
- xdp_op = ops->ndo_xdp;
+ xdp_op = xdp_chk = ops->ndo_xdp;
+ if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
+ return -EOPNOTSUPP;
if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
xdp_op = generic_xdp_install;
+ if (xdp_op == xdp_chk)
+ xdp_chk = generic_xdp_install;
if (fd >= 0) {
- if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
-
- err = xdp_op(dev, &xdp);
- if (err < 0)
- return err;
- if (xdp.prog_attached)
- return -EBUSY;
- }
+ if (xdp_chk && __dev_xdp_attached(dev, xdp_chk))
+ return -EEXIST;
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+ __dev_xdp_attached(dev, xdp_op))
+ return -EBUSY;
prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
if (IS_ERR(prog))
return PTR_ERR(prog);
}
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_SETUP_PROG;
- xdp.extack = extack;
- xdp.prog = prog;
-
- err = xdp_op(dev, &xdp);
+ err = dev_xdp_install(dev, xdp_op, extack, prog);
if (err < 0 && prog)
bpf_prog_put(prog);
@@ -7264,12 +7284,10 @@ static int netif_alloc_rx_queues(struct net_device *dev)
BUG_ON(count < 1);
- rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
- if (!rx) {
- rx = vzalloc(sz);
- if (!rx)
- return -ENOMEM;
- }
+ rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT);
+ if (!rx)
+ return -ENOMEM;
+
dev->_rx = rx;
for (i = 0; i < count; i++)
@@ -7306,12 +7324,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
if (count < 1 || count > 0xffff)
return -EINVAL;
- tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
- if (!tx) {
- tx = vzalloc(sz);
- if (!tx)
- return -ENOMEM;
- }
+ tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT);
+ if (!tx)
+ return -ENOMEM;
+
dev->_tx = tx;
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -7845,9 +7861,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
- p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
- if (!p)
- p = vzalloc(alloc_size);
+ p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT);
if (!p)
return NULL;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 58b0bcc125b5..d274f81fcc2c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
lladdr = neigh->ha;
}
- if (new & NUD_CONNECTED)
- neigh->confirmed = jiffies;
- neigh->updated = jiffies;
-
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
@@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
}
}
+ /* Update timestamps only once we know we will make a change to the
+ * neighbour entry. Otherwise we risk to move the locktime window with
+ * noop updates and ignore relevant ARP updates.
+ */
+ if (new != old || lladdr != neigh->ha) {
+ if (new & NUD_CONNECTED)
+ neigh->confirmed = jiffies;
+ neigh->updated = jiffies;
+ }
+
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_PROBE)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bcb0f610ee42..9e2c0a7cb325 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -899,8 +899,7 @@ static size_t rtnl_port_size(const struct net_device *dev,
static size_t rtnl_xdp_size(void)
{
size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
- nla_total_size(1) + /* XDP_ATTACHED */
- nla_total_size(4); /* XDP_FLAGS */
+ nla_total_size(1); /* XDP_ATTACHED */
return xdp_size;
}
@@ -1247,37 +1246,34 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static u8 rtnl_xdp_attached_mode(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ ASSERT_RTNL();
+
+ if (rcu_access_pointer(dev->xdp_prog))
+ return XDP_ATTACHED_SKB;
+ if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp))
+ return XDP_ATTACHED_DRV;
+
+ return XDP_ATTACHED_NONE;
+}
+
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
{
struct nlattr *xdp;
- u32 xdp_flags = 0;
- u8 val = 0;
int err;
xdp = nla_nest_start(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
- if (rcu_access_pointer(dev->xdp_prog)) {
- xdp_flags = XDP_FLAGS_SKB_MODE;
- val = 1;
- } else if (dev->netdev_ops->ndo_xdp) {
- struct netdev_xdp xdp_op = {};
-
- xdp_op.command = XDP_QUERY_PROG;
- err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
- if (err)
- goto err_cancel;
- val = xdp_op.prog_attached;
- }
- err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val);
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
+ rtnl_xdp_attached_mode(dev));
if (err)
goto err_cancel;
- if (xdp_flags) {
- err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags);
- if (err)
- goto err_cancel;
- }
nla_nest_end(skb, xdp);
return 0;
@@ -1631,13 +1627,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
cb->nlh->nlmsg_seq, 0,
flags,
ext_filter_mask);
- /* If we ran out of room on the first message,
- * we're in trouble
- */
- WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
- if (err < 0)
- goto out;
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
@@ -1645,10 +1641,12 @@ cont:
}
}
out:
+ err = skb->len;
+out_err:
cb->args[1] = idx;
cb->args[0] = h;
- return skb->len;
+ return err;
}
int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
@@ -2199,6 +2197,11 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
+ if ((xdp_flags & XDP_FLAGS_SKB_MODE) &&
+ (xdp_flags & XDP_FLAGS_DRV_MODE)) {
+ err = -EINVAL;
+ goto errout;
+ }
}
if (xdp[IFLA_XDP_FD]) {
@@ -3228,8 +3231,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
int err = 0;
int fidx = 0;
- if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
- IFLA_MAX, ifla_policy, NULL) == 0) {
+ err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, NULL);
+ if (err < 0) {
+ return -EINVAL;
+ } else if (err == 0) {
if (tb[IFLA_MASTER])
br_idx = nla_get_u32(tb[IFLA_MASTER]);
}
@@ -3452,8 +3458,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
err = br_dev->netdev_ops->ndo_bridge_getlink(
skb, portid, seq, dev,
filter_mask, NLM_F_MULTI);
- if (err < 0 && err != -EOPNOTSUPP)
- break;
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
}
idx++;
}
@@ -3464,16 +3474,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
seq, dev,
filter_mask,
NLM_F_MULTI);
- if (err < 0 && err != -EOPNOTSUPP)
- break;
+ if (err < 0 && err != -EOPNOTSUPP) {
+ if (likely(skb->len))
+ break;
+
+ goto out_err;
+ }
}
idx++;
}
}
+ err = skb->len;
+out_err:
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static inline size_t bridge_nlmsg_size(void)
diff --git a/net/core/sock.c b/net/core/sock.c
index b5baeb9cb0fb..727f924b7f91 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -102,6 +102,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -138,10 +139,7 @@
#include <trace/events/sock.h>
-#ifdef CONFIG_INET
#include <net/tcp.h>
-#endif
-
#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
@@ -372,14 +370,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
int ret;
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
/* these should have been dropped before queueing */
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
ret = sk->sk_backlog_rcv(sk, skb);
- current_restore_flags(pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
@@ -1802,28 +1800,24 @@ EXPORT_SYMBOL(skb_set_owner_w);
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
* But we also want to keep skb->sk set because some packet schedulers
- * rely on it (sch_fq for example). So we set skb->truesize to a small
- * amount (1) and decrease sk_wmem_alloc accordingly.
+ * rely on it (sch_fq for example).
*/
void skb_orphan_partial(struct sk_buff *skb)
{
- /* If this skb is a TCP pure ACK or already went here,
- * we have nothing to do. 2 is already a very small truesize.
- */
- if (skb->truesize <= 2)
+ if (skb_is_tcp_pure_ack(skb))
return;
- /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
- * so we do not completely orphan skb, but transfert all
- * accounted bytes but one, to avoid unexpected reorders.
- */
if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
|| skb->destructor == tcp_wfree
#endif
) {
- atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
- skb->truesize = 1;
+ struct sock *sk = skb->sk;
+
+ if (atomic_inc_not_zero(&sk->sk_refcnt)) {
+ atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ skb->destructor = sock_efree;
+ }
} else {
skb_orphan(skb);
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index b99168b0fabf..f75482bdee9a 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.rsk_prot = &dccp_request_sock_ops,
.twsk_prot = &dccp_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9b6a4e403e7..992621172220 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -426,6 +426,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
@@ -490,6 +493,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
newnp->opt = NULL;
newnp->mcast_oif = inet6_iif(skb);
@@ -1014,7 +1020,7 @@ static struct proto dccp_v6_prot = {
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.rsk_prot = &dccp6_request_sock_ops,
.twsk_prot = &dccp6_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo,
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 9afa2a5030b2..405483a07efc 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -2361,7 +2361,8 @@ MODULE_AUTHOR("Linux DECnet Project Team");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_DECnet);
-static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n";
+static const char banner[] __initconst = KERN_INFO
+"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n";
static int __init decnet_init(void)
{
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 482730cd8a56..eeb5fc561f80 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -110,7 +110,7 @@ struct neigh_table dn_neigh_table = {
static int dn_neigh_construct(struct neighbour *neigh)
{
struct net_device *dev = neigh->dev;
- struct dn_neigh *dn = (struct dn_neigh *)neigh;
+ struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n);
struct dn_dev *dn_db;
struct neigh_parms *parms;
@@ -339,7 +339,7 @@ int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
struct dn_route *rt = (struct dn_route *) dst;
struct neighbour *neigh = rt->n;
- struct dn_neigh *dn = (struct dn_neigh *)neigh;
+ struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n);
struct dn_dev *dn_db;
bool use_long;
@@ -391,7 +391,7 @@ int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
- dn = (struct dn_neigh *)neigh;
+ dn = container_of(neigh, struct dn_neigh, n);
if (neigh) {
write_lock(&neigh->lock);
@@ -451,7 +451,7 @@ int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb
neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
- dn = (struct dn_neigh *)neigh;
+ dn = container_of(neigh, struct dn_neigh, n);
if (neigh) {
write_lock(&neigh->lock);
@@ -510,7 +510,7 @@ static void neigh_elist_cb(struct neighbour *neigh, void *_info)
if (neigh->dev != s->dev)
return;
- dn = (struct dn_neigh *) neigh;
+ dn = container_of(neigh, struct dn_neigh, n);
if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2)))
return;
@@ -549,7 +549,7 @@ int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n)
static inline void dn_neigh_format_entry(struct seq_file *seq,
struct neighbour *n)
{
- struct dn_neigh *dn = (struct dn_neigh *) n;
+ struct dn_neigh *dn = container_of(n, struct dn_neigh, n);
char buf[DN_ASCBUF_LEN];
read_lock(&n->lock);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0937b34c27ca..ae96e6f3e0cb 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -641,6 +641,32 @@ void arp_xmit(struct sk_buff *skb)
}
EXPORT_SYMBOL(arp_xmit);
+static bool arp_is_garp(struct net *net, struct net_device *dev,
+ int *addr_type, __be16 ar_op,
+ __be32 sip, __be32 tip,
+ unsigned char *sha, unsigned char *tha)
+{
+ bool is_garp = tip == sip;
+
+ /* Gratuitous ARP _replies_ also require target hwaddr to be
+ * the same as source.
+ */
+ if (is_garp && ar_op == htons(ARPOP_REPLY))
+ is_garp =
+ /* IPv4 over IEEE 1394 doesn't provide target
+ * hardware address field in its ARP payload.
+ */
+ tha &&
+ !memcmp(tha, sha, dev->addr_len);
+
+ if (is_garp) {
+ *addr_type = inet_addr_type_dev_table(net, dev, sip);
+ if (*addr_type != RTN_UNICAST)
+ is_garp = false;
+ }
+ return is_garp;
+}
+
/*
* Process an arp request.
*/
@@ -653,6 +679,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha;
+ unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
@@ -724,6 +751,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
break;
#endif
default:
+ tha = arp_ptr;
arp_ptr += dev->addr_len;
}
memcpy(&tip, arp_ptr, 4);
@@ -835,19 +863,25 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
- if (IN_DEV_ARP_ACCEPT(in_dev)) {
- unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip);
+ if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
+ addr_type = -1;
+ is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
+ sip, tip, sha, tha);
+ }
+ if (IN_DEV_ARP_ACCEPT(in_dev)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
- is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip &&
- addr_type == RTN_UNICAST;
-
if (!n &&
- ((arp->ar_op == htons(ARPOP_REPLY) &&
- addr_type == RTN_UNICAST) || is_garp))
+ (is_garp ||
+ (arp->ar_op == htons(ARPOP_REPLY) &&
+ (addr_type == RTN_UNICAST ||
+ (addr_type < 0 &&
+ /* postpone calculation to as late as possible */
+ inet_addr_type_dev_table(net, dev, sip) ==
+ RTN_UNICAST)))))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 65cc02bd82bc..93322f895eab 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
u8 *tail;
u8 *vaddr;
int nfrags;
+ int esph_offset;
struct page *page;
struct sk_buff *trailer;
int tailen = esp->tailen;
@@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
}
cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
nfrags = skb_cow_data(skb, tailen, &trailer);
if (nfrags < 0)
goto out;
tail = skb_tail_pointer(trailer);
- esp->esph = ip_esp_hdr(skb);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
skip_cow:
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 39bd1edee676..83e3ed258467 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -763,7 +763,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int e = 0, s_e;
struct fib_table *tb;
struct hlist_head *head;
- int dumped = 0;
+ int dumped = 0, err;
if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
@@ -783,20 +783,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- if (fib_table_dump(tb, skb, cb) < 0)
- goto out;
+ err = fib_table_dump(tb, skb, cb);
+ if (err < 0) {
+ if (likely(skb->len))
+ goto out;
+
+ goto out_err;
+ }
dumped = 1;
next:
e++;
}
}
out:
+ err = skb->len;
+out_err:
rcu_read_unlock();
cb->args[1] = e;
cb->args[0] = h;
- return skb->len;
+ return err;
}
/* Prepare and feed intra-kernel routing request.
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1201409ba1dc..51182ff2b441 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1983,6 +1983,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
/* rcu_read_lock is hold by caller */
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ int err;
+
if (i < s_i) {
i++;
continue;
@@ -1993,17 +1995,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
continue;
}
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- xkey,
- KEYLENGTH - fa->fa_slen,
- fa->fa_tos,
- fa->fa_info, NLM_F_MULTI) < 0) {
+ err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+ tb->tb_id, fa->fa_type,
+ xkey, KEYLENGTH - fa->fa_slen,
+ fa->fa_tos, fa->fa_info, NLM_F_MULTI);
+ if (err < 0) {
cb->args[4] = i;
- return -1;
+ return err;
}
i++;
}
@@ -2025,10 +2024,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
t_key key = cb->args[3];
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
- if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
+ int err;
+
+ err = fn_trie_dump_leaf(l, tb, skb, cb);
+ if (err < 0) {
cb->args[3] = key;
cb->args[2] = count;
- return -1;
+ return err;
}
++count;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5e313c1ac94f..1054d330bf9d 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -794,6 +794,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
/* listeners have SOCK_RCU_FREE, not the children */
sock_reset_flag(newsk, SOCK_RCU_FREE);
+ inet_sk(newsk)->mc_list = NULL;
+
newsk->sk_mark = inet_rsk(req)->ir_mark;
atomic64_set(&newsk->sk_cookie,
atomic64_read(&inet_rsk(req)->ir_cookie));
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 8bea74298173..e9a59d2d91d4 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -678,11 +678,7 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
/* no more locks than number of hash buckets */
nblocks = min(nblocks, hashinfo->ehash_mask + 1);
- hashinfo->ehash_locks = kmalloc_array(nblocks, locksz,
- GFP_KERNEL | __GFP_NOWARN);
- if (!hashinfo->ehash_locks)
- hashinfo->ehash_locks = vmalloc(nblocks * locksz);
-
+ hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
if (!hashinfo->ehash_locks)
return -ENOMEM;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a02d52ed50e..551de4d023a8 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1980,6 +1980,20 @@ int ip_mr_input(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
struct mr_table *mrt;
+ struct net_device *dev;
+
+ /* skb->dev passed in is the loX master dev for vrfs.
+ * As there are no vifs associated with loopback devices,
+ * get the proper interface that does have a vif associated with it.
+ */
+ dev = skb->dev;
+ if (netif_is_l3_master(skb->dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
/* Packet is looped back after forward, it should not be
* forwarded second time, but still can be delivered locally.
@@ -2017,7 +2031,7 @@ int ip_mr_input(struct sk_buff *skb)
/* already under rcu_read_lock() */
cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
if (!cache) {
- int vif = ipmr_find_vif(mrt, skb->dev);
+ int vif = ipmr_find_vif(mrt, dev);
if (vif >= 0)
cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
@@ -2037,7 +2051,7 @@ int ip_mr_input(struct sk_buff *skb)
}
read_lock(&mrt_lock);
- vif = ipmr_find_vif(mrt, skb->dev);
+ vif = ipmr_find_vif(mrt, dev);
if (vif >= 0) {
int err2 = ipmr_cache_unresolved(mrt, vif, skb);
read_unlock(&mrt_lock);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f647310f8e4d..655d9eebe43e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1387,11 +1387,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
{
struct rtable *rt = (struct rtable *) dst;
- if (rt->fi) {
- fib_info_put(rt->fi);
- rt->fi = NULL;
- }
-
if (!list_empty(&rt->rt_uncached)) {
struct uncached_list *ul = rt->rt_uncached_list;
@@ -1429,16 +1424,6 @@ static bool rt_cache_valid(const struct rtable *rt)
!rt_is_expired(rt);
}
-static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
-{
- if (fi->fib_metrics != (u32 *)dst_default_metrics) {
- fib_info_hold(fi);
- rt->fi = fi;
- }
-
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
-}
-
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
struct fib_nh_exception *fnhe,
@@ -1453,7 +1438,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
- rt_init_metrics(rt, fi);
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
@@ -1505,7 +1490,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
rt->rt_gateway = 0;
rt->rt_uses_gateway = 0;
rt->rt_table_id = 0;
- rt->fi = NULL;
INIT_LIST_HEAD(&rt->rt_uncached);
rt->dst.output = ip_output;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1e4c76d2b827..842b575f8fdd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2320,6 +2320,10 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
+ /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+ * issue in __tcp_select_window()
+ */
+ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5a3ad09e2786..174d4376baa5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1179,13 +1179,14 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
*/
if (pkt_len > mss) {
unsigned int new_len = (pkt_len / mss) * mss;
- if (!in_sack && new_len < pkt_len) {
+ if (!in_sack && new_len < pkt_len)
new_len += mss;
- if (new_len >= skb->len)
- return 0;
- }
pkt_len = new_len;
}
+
+ if (pkt_len >= skb->len && !in_sack)
+ return 0;
+
err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
@@ -3189,7 +3190,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
int delta;
/* Non-retransmitted hole got filled? That's reordering */
- if (reord < prior_fackets)
+ if (reord < prior_fackets && reord <= tp->fackets_out)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a51582bef55..5ab2aac5ca19 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2395,7 +2395,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 9d0d4f39e42b..653bbd67e3a3 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1011,10 +1011,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
tcp_metrics_hash_log = order_base_2(slots);
size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
- tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
- if (!tcp_metrics_hash)
- tcp_metrics_hash = vzalloc(size);
-
+ tcp_metrics_hash = kvzalloc(size, GFP_KERNEL);
if (!tcp_metrics_hash)
return -ENOMEM;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ea6e4cff9faf..1d6219bf2d6b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1612,7 +1612,7 @@ static void udp_v4_rehash(struct sock *sk)
udp_lib_rehash(sk, new_hash);
}
-int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -1657,7 +1657,7 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index feb50a16398d..a8cf8c6fb60c 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -25,7 +25,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
-int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 77a4bd526d6e..6a4fb1e629fb 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1022,7 +1022,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
INIT_HLIST_NODE(&ifa->addr_lst);
ifa->scope = scope;
ifa->prefix_len = pfxlen;
- ifa->flags = flags | IFA_F_TENTATIVE;
+ ifa->flags = flags;
+ /* No need to add the TENTATIVE flag for addresses with NODAD */
+ if (!(flags & IFA_F_NODAD))
+ ifa->flags |= IFA_F_TENTATIVE;
ifa->valid_lft = valid_lft;
ifa->prefered_lft = prefered_lft;
ifa->cstamp = ifa->tstamp = jiffies;
@@ -3548,6 +3551,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
*/
static struct notifier_block ipv6_dev_notf = {
.notifier_call = addrconf_notify,
+ .priority = ADDRCONF_NOTIFY_PRIORITY,
};
static void addrconf_type_change(struct net_device *dev, unsigned long event)
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index af8f52ee7180..2fd5ca151dcf 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan)
size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
if (sizeof(spinlock_t) != 0) {
-#ifdef CONFIG_NUMA
- if (size * sizeof(spinlock_t) > PAGE_SIZE)
- ilan->locks = vmalloc(size * sizeof(spinlock_t));
- else
-#endif
- ilan->locks = kmalloc_array(size, sizeof(spinlock_t),
- GFP_KERNEL);
+ ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL);
if (!ilan->locks)
return -ENOMEM;
for (i = 0; i < size; i++)
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 93e58a5e1837..280268f1dd7b 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -63,7 +63,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
const struct net_offload *ops;
int proto;
struct frag_hdr *fptr;
- unsigned int unfrag_ip6hlen;
unsigned int payload_len;
u8 *prevhdr;
int offset = 0;
@@ -116,8 +115,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
skb->network_header = (u8 *)ipv6h - skb->head;
if (udpfrag) {
- unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
- fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen);
+ int err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ return ERR_PTR(err);
+ fptr = (struct frag_hdr *)((u8 *)ipv6h + err);
fptr->frag_off = htons(offset);
if (skb->next)
fptr->frag_off |= htons(IP6_MF);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 58f6288e9ba5..bf8a58a1c32d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -597,7 +597,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
- hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto fail;
+ hlen = err;
nexthdr = *prevhdr;
mtu = ip6_skb_dst_mtu(skb);
@@ -1463,6 +1466,11 @@ alloc_new_skb:
*/
alloclen += sizeof(struct frag_hdr);
+ copy = datalen - transhdrlen - fraggap;
+ if (copy < 0) {
+ err = -EINVAL;
+ goto error;
+ }
if (transhdrlen) {
skb = sock_alloc_send_skb(sk,
alloclen + hh_len,
@@ -1512,13 +1520,9 @@ alloc_new_skb:
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
-
- if (copy < 0) {
- err = -EINVAL;
- kfree_skb(skb);
- goto error;
- } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ if (copy > 0 &&
+ getfrag(from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index cd4252346a32..e9065b8d3af8 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -79,14 +79,13 @@ EXPORT_SYMBOL(ipv6_select_ident);
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr =
- (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
unsigned int packet_len = skb_tail_pointer(skb) -
skb_network_header(skb);
int found_rhdr = 0;
*nexthdr = &ipv6_hdr(skb)->nexthdr;
- while (offset + 1 <= packet_len) {
+ while (offset <= packet_len) {
+ struct ipv6_opt_hdr *exthdr;
switch (**nexthdr) {
@@ -107,13 +106,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
return offset;
}
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
+ if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
+ return -EINVAL;
+
exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
offset);
+ offset += ipv6_optlen(exthdr);
+ *nexthdr = &exthdr->nexthdr;
}
- return offset;
+ return -EINVAL;
}
EXPORT_SYMBOL(ip6_find_1stfragopt);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 2f1136627dcb..dc61b0b5e64e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3709,7 +3709,10 @@ static int ip6_route_dev_notify(struct notifier_block *this,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
- if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
+ if (!(dev->flags & IFF_LOOPBACK))
+ return NOTIFY_OK;
+
+ if (event == NETDEV_REGISTER) {
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -3718,6 +3721,12 @@ static int ip6_route_dev_notify(struct notifier_block *this,
net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
+ } else if (event == NETDEV_UNREGISTER) {
+ in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+ in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
+ in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
+#endif
}
return NOTIFY_OK;
@@ -4024,7 +4033,7 @@ static struct pernet_operations ip6_route_net_late_ops = {
static struct notifier_block ip6_route_dev_notifier = {
.notifier_call = ip6_route_dev_notify,
- .priority = 0,
+ .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index aeb9497b5bb7..4f4310a36a04 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
@@ -1131,6 +1132,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
First: no IPv4 options.
*/
newinet->inet_opt = NULL;
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
@@ -1917,7 +1919,7 @@ struct proto tcpv6_prot = {
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 04862abfe4ec..06ec39b79609 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -526,7 +526,7 @@ out:
return;
}
-int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int rc;
@@ -569,7 +569,7 @@ void udpv6_encap_enable(void)
}
EXPORT_SYMBOL(udpv6_encap_enable);
-int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index e78bdc76dcc3..f180b3d85e31 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -26,7 +26,6 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int flags, int *addr_len);
-int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
void udpv6_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index ac858c480f2f..a2267f80febb 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
u8 frag_hdr_sz = sizeof(struct frag_hdr);
__wsum csum;
int tnl_hlen;
+ int err;
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
@@ -90,7 +91,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
/* Find the unfragmentable header and shift it left by frag_hdr_sz
* bytes to insert fragment header.
*/
- unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ return ERR_PTR(err);
+ unfrag_ip6hlen = err;
nexthdr = *prevhdr;
*prevhdr = NEXTHDR_FRAGMENT;
unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
diff --git a/net/key/af_key.c b/net/key/af_key.c
index c1950bb14735..512dc43d0ce6 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3285,7 +3285,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt,
p += pol->sadb_x_policy_len*8;
sec_ctx = (struct sadb_x_sec_ctx *)p;
if (len < pol->sadb_x_policy_len*8 +
- sec_ctx->sadb_x_sec_len) {
+ sec_ctx->sadb_x_sec_len*8) {
*dir = -EINVAL;
goto out;
}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index cb4fff785cbf..8364fe5b59e4 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -142,7 +142,7 @@ static struct proto llc_proto = {
.name = "LLC",
.owner = THIS_MODULE,
.obj_size = sizeof(struct llc_sock),
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
};
/**
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 8bc5a1bd2d45..9b02c13d258b 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_estab_match(sap, daddr, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
@@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_listener_match(sap, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index 5404d0d195cc..63b6ab056370 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
if (llc_dgram_match(sap, laddr, rc)) {
- /* Extra checks required by SLAB_DESTROY_BY_RCU */
+ /* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 088e2b459d0f..257ec66009da 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2005,10 +2005,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
unsigned index;
if (size) {
- labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
- if (!labels)
- labels = vzalloc(size);
-
+ labels = kvzalloc(size, GFP_KERNEL);
if (!labels)
goto nolabels;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index d2d7bdf1d510..ad99c1ceea6f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -849,10 +849,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
{
unsigned int verdict = NF_DROP;
- if (IP_VS_FWD_METHOD(cp) != 0) {
- pr_err("shouldn't reach here, because the box is on the "
- "half connection in the tun/dr module.\n");
- }
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto ignore_cp;
/* Ensure the checksum is correct */
if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
@@ -886,6 +884,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
ip_vs_notrack(skb);
else
ip_vs_update_conntrack(skb, cp, 0);
+
+ignore_cp:
verdict = NF_ACCEPT;
out:
@@ -1385,8 +1385,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
*/
cp = pp->conn_out_get(ipvs, af, skb, &iph);
- if (likely(cp))
+ if (likely(cp)) {
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ goto ignore_cp;
return handle_response(af, skb, pd, cp, &iph, hooknum);
+ }
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
@@ -1444,9 +1447,15 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
}
}
}
+
+out:
IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
"ip_vs_out: packet continues traversal as normal");
return NF_ACCEPT;
+
+ignore_cp:
+ __ip_vs_conn_put(cp);
+ goto out;
}
/*
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3c8f1ed2f555..e847dbaa0c6b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -911,7 +911,7 @@ static unsigned int early_drop_list(struct net *net,
continue;
/* kill only if still in same netns -- might have moved due to
- * SLAB_DESTROY_BY_RCU rules.
+ * SLAB_TYPESAFE_BY_RCU rules.
*
* We steal the timer reference. If that fails timer has
* already fired or someone else deleted it. Just drop ref
@@ -1114,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net,
/*
* Do not use kmem_cache_zalloc(), as this cache uses
- * SLAB_DESTROY_BY_RCU.
+ * SLAB_TYPESAFE_BY_RCU.
*/
ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL)
@@ -1159,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct)
struct net *net = nf_ct_net(ct);
/* A freed object has refcnt == 0, that's
- * the golden rule for SLAB_DESTROY_BY_RCU
+ * the golden rule for SLAB_TYPESAFE_BY_RCU
*/
NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
@@ -1929,7 +1929,7 @@ int nf_conntrack_init_start(void)
nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
sizeof(struct nf_conn),
NFCT_INFOMASK + 1,
- SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
if (!nf_conntrack_cachep)
goto err_cachep;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 3a60efa7799b..7f6100ca63be 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -174,6 +174,10 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
#endif
if (h != NULL && !try_module_get(h->me))
h = NULL;
+ if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) {
+ module_put(h->me);
+ h = NULL;
+ }
rcu_read_unlock();
@@ -181,6 +185,13 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum)
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get);
+void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
+{
+ refcount_dec(&helper->refcnt);
+ module_put(helper->me);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
+
struct nf_conn_help *
nf_ct_helper_ext_add(struct nf_conn *ct,
struct nf_conntrack_helper *helper, gfp_t gfp)
@@ -417,6 +428,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
}
}
}
+ refcount_set(&me->refcnt, 1);
hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
nf_ct_helper_count++;
out:
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index dcf561b5c97a..9799a50bc604 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -45,6 +45,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_synproxy.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l4proto.h>
@@ -1007,9 +1009,8 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = {
static int
ctnetlink_parse_tuple(const struct nlattr * const cda[],
- struct nf_conntrack_tuple *tuple,
- enum ctattr_type type, u_int8_t l3num,
- struct nf_conntrack_zone *zone)
+ struct nf_conntrack_tuple *tuple, u32 type,
+ u_int8_t l3num, struct nf_conntrack_zone *zone)
{
struct nlattr *tb[CTA_TUPLE_MAX+1];
int err;
@@ -1828,6 +1829,8 @@ ctnetlink_create_conntrack(struct net *net,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+ nfct_seqadj_ext_add(ct);
+ nfct_synproxy_ext_add(ct);
/* we must add conntrack extensions before confirmation. */
ct->status |= IPS_CONFIRMED;
@@ -2447,7 +2450,7 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = {
static int ctnetlink_exp_dump_tuple(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
- enum ctattr_expect type)
+ u32 type)
{
struct nlattr *nest_parms;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index b48d6b5aae8a..ef0be325a0c6 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -409,6 +409,10 @@ nf_nat_setup_info(struct nf_conn *ct,
{
struct nf_conntrack_tuple curr_tuple, new_tuple;
+ /* Can't setup nat info for confirmed ct. */
+ if (nf_ct_is_confirmed(ct))
+ return NF_ACCEPT;
+
NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
maniptype == NF_NAT_MANIP_DST);
BUG_ON(nf_nat_initialized(ct, maniptype));
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 559225029740..da314be0c048 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3367,35 +3367,50 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
return nf_tables_fill_setelem(args->skb, set, elem);
}
+struct nft_set_dump_ctx {
+ const struct nft_set *set;
+ struct nft_ctx ctx;
+};
+
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct nft_set_dump_ctx *dump_ctx = cb->data;
struct net *net = sock_net(skb->sk);
- u8 genmask = nft_genmask_cur(net);
+ struct nft_af_info *afi;
+ struct nft_table *table;
struct nft_set *set;
struct nft_set_dump_args args;
- struct nft_ctx ctx;
- struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
+ bool set_found = false;
struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
struct nlattr *nest;
u32 portid, seq;
- int event, err;
+ int event;
- err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla,
- NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy,
- NULL);
- if (err < 0)
- return err;
+ rcu_read_lock();
+ list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+ if (afi != dump_ctx->ctx.afi)
+ continue;
- err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
- (void *)nla, genmask);
- if (err < 0)
- return err;
+ list_for_each_entry_rcu(table, &afi->tables, list) {
+ if (table != dump_ctx->ctx.table)
+ continue;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
- genmask);
- if (IS_ERR(set))
- return PTR_ERR(set);
+ list_for_each_entry_rcu(set, &table->sets, list) {
+ if (set == dump_ctx->set) {
+ set_found = true;
+ break;
+ }
+ }
+ break;
+ }
+ break;
+ }
+
+ if (!set_found) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM);
portid = NETLINK_CB(cb->skb).portid;
@@ -3407,11 +3422,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
goto nla_put_failure;
nfmsg = nlmsg_data(nlh);
- nfmsg->nfgen_family = ctx.afi->family;
+ nfmsg->nfgen_family = afi->family;
nfmsg->version = NFNETLINK_V0;
- nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff);
+ nfmsg->res_id = htons(net->nft.base_seq & 0xffff);
- if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name))
+ if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
goto nla_put_failure;
@@ -3422,12 +3437,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
args.cb = cb;
args.skb = skb;
- args.iter.genmask = nft_genmask_cur(ctx.net);
+ args.iter.genmask = nft_genmask_cur(net);
args.iter.skip = cb->args[0];
args.iter.count = 0;
args.iter.err = 0;
args.iter.fn = nf_tables_dump_setelem;
- set->ops->walk(&ctx, set, &args.iter);
+ set->ops->walk(&dump_ctx->ctx, set, &args.iter);
+ rcu_read_unlock();
nla_nest_end(skb, nest);
nlmsg_end(skb, nlh);
@@ -3441,9 +3457,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
nla_put_failure:
+ rcu_read_unlock();
return -ENOSPC;
}
+static int nf_tables_dump_set_done(struct netlink_callback *cb)
+{
+ kfree(cb->data);
+ return 0;
+}
+
static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -3465,7 +3488,18 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_set,
+ .done = nf_tables_dump_set_done,
};
+ struct nft_set_dump_ctx *dump_ctx;
+
+ dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL);
+ if (!dump_ctx)
+ return -ENOMEM;
+
+ dump_ctx->set = set;
+ dump_ctx->ctx = ctx;
+
+ c.data = dump_ctx;
return netlink_dump_start(nlsk, skb, nlh, &c);
}
return -EOPNOTSUPP;
@@ -3593,9 +3627,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
{
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
- nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE);
+ nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
- nft_data_uninit(nft_set_ext_data(ext), set->dtype);
+ nft_data_release(nft_set_ext_data(ext), set->dtype);
if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
@@ -3604,6 +3638,18 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
+/* Only called from commit path, nft_set_elem_deactivate() already deals with
+ * the refcounting from the preparation phase.
+ */
+static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem)
+{
+ struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
+ kfree(elem);
+}
+
static int nft_setelem_parse_flags(const struct nft_set *set,
const struct nlattr *attr, u32 *flags)
{
@@ -3815,9 +3861,9 @@ err4:
kfree(elem.priv);
err3:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
- nft_data_uninit(&data, d2.type);
+ nft_data_release(&data, d2.type);
err2:
- nft_data_uninit(&elem.key.val, d1.type);
+ nft_data_release(&elem.key.val, d1.type);
err1:
return err;
}
@@ -3862,6 +3908,53 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
return err;
}
+/**
+ * nft_data_hold - hold a nft_data item
+ *
+ * @data: struct nft_data to release
+ * @type: type of data
+ *
+ * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded,
+ * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and
+ * NFT_GOTO verdicts. This function must be called on active data objects
+ * from the second phase of the commit protocol.
+ */
+static void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
+{
+ if (type == NFT_DATA_VERDICT) {
+ switch (data->verdict.code) {
+ case NFT_JUMP:
+ case NFT_GOTO:
+ data->verdict.chain->use++;
+ break;
+ }
+ }
+}
+
+static void nft_set_elem_activate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_hold(nft_set_ext_data(ext), set->dtype);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use++;
+}
+
+static void nft_set_elem_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
+{
+ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
+ nft_data_release(nft_set_ext_data(ext), set->dtype);
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+ (*nft_set_ext_obj(ext))->use--;
+}
+
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
@@ -3927,6 +4020,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
kfree(elem.priv);
elem.priv = priv;
+ nft_set_elem_deactivate(ctx->net, set, &elem);
+
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
@@ -3936,7 +4031,7 @@ err4:
err3:
kfree(elem.priv);
err2:
- nft_data_uninit(&elem.key.val, desc.type);
+ nft_data_release(&elem.key.val, desc.type);
err1:
return err;
}
@@ -4743,8 +4838,8 @@ static void nf_tables_commit_release(struct nft_trans *trans)
nft_set_destroy(nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
- nft_set_elem_destroy(nft_trans_elem_set(trans),
- nft_trans_elem(trans).priv, true);
+ nf_tables_set_elem_destroy(nft_trans_elem_set(trans),
+ nft_trans_elem(trans).priv);
break;
case NFT_MSG_DELOBJ:
nft_obj_destroy(nft_trans_obj(trans));
@@ -4979,6 +5074,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
case NFT_MSG_DELSETELEM:
te = (struct nft_trans_elem *)trans->data;
+ nft_set_elem_activate(net, te->set, &te->elem);
te->set->ops->activate(net, te->set, &te->elem);
te->set->ndeact--;
@@ -5464,7 +5560,7 @@ int nft_data_init(const struct nft_ctx *ctx,
EXPORT_SYMBOL_GPL(nft_data_init);
/**
- * nft_data_uninit - release a nft_data item
+ * nft_data_release - release a nft_data item
*
* @data: struct nft_data to release
* @type: type of data
@@ -5472,7 +5568,7 @@ EXPORT_SYMBOL_GPL(nft_data_init);
* Release a nft_data item. NFT_DATA_VALUE types can be silently discarded,
* all others need to be released by calling this function.
*/
-void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
+void nft_data_release(const struct nft_data *data, enum nft_data_types type)
{
if (type < NFT_DATA_VERDICT)
return;
@@ -5483,7 +5579,7 @@ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
WARN_ON(1);
}
}
-EXPORT_SYMBOL_GPL(nft_data_uninit);
+EXPORT_SYMBOL_GPL(nft_data_release);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
enum nft_data_types type, unsigned int len)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 950bf6eadc65..be678a323598 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -686,6 +686,7 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
tuple_set = true;
}
+ ret = -ENOENT;
list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) {
cur = &nlcth->helper;
j++;
@@ -699,16 +700,20 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
tuple.dst.protonum != cur->tuple.dst.protonum))
continue;
- found = true;
- nf_conntrack_helper_unregister(cur);
- kfree(cur->expect_policy);
+ if (refcount_dec_if_one(&cur->refcnt)) {
+ found = true;
+ nf_conntrack_helper_unregister(cur);
+ kfree(cur->expect_policy);
- list_del(&nlcth->list);
- kfree(nlcth);
+ list_del(&nlcth->list);
+ kfree(nlcth);
+ } else {
+ ret = -EBUSY;
+ }
}
/* Make sure we return success if we flush and there is no helpers */
- return (found || j == 0) ? 0 : -ENOENT;
+ return (found || j == 0) ? 0 : ret;
}
static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = {
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 877d9acd91ef..fff8073e2a56 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -83,17 +83,26 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
tb[NFTA_BITWISE_MASK]);
if (err < 0)
return err;
- if (d1.len != priv->len)
- return -EINVAL;
+ if (d1.len != priv->len) {
+ err = -EINVAL;
+ goto err1;
+ }
err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2,
tb[NFTA_BITWISE_XOR]);
if (err < 0)
- return err;
- if (d2.len != priv->len)
- return -EINVAL;
+ goto err1;
+ if (d2.len != priv->len) {
+ err = -EINVAL;
+ goto err2;
+ }
return 0;
+err2:
+ nft_data_release(&priv->xor, d2.type);
+err1:
+ nft_data_release(&priv->mask, d1.type);
+ return err;
}
static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 2b96effeadc1..c2945eb3397c 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -201,10 +201,18 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
if (err < 0)
return ERR_PTR(err);
+ if (desc.type != NFT_DATA_VALUE) {
+ err = -EINVAL;
+ goto err1;
+ }
+
if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ)
return &nft_cmp_fast_ops;
- else
- return &nft_cmp_ops;
+
+ return &nft_cmp_ops;
+err1:
+ nft_data_release(&data, desc.type);
+ return ERR_PTR(-EINVAL);
}
struct nft_expr_type nft_cmp_type __read_mostly = {
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index a34ceb38fc55..1678e9e75e8e 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -826,9 +826,9 @@ static void nft_ct_helper_obj_destroy(struct nft_object *obj)
struct nft_ct_helper_obj *priv = nft_obj_data(obj);
if (priv->helper4)
- module_put(priv->helper4->me);
+ nf_conntrack_helper_put(priv->helper4);
if (priv->helper6)
- module_put(priv->helper6->me);
+ nf_conntrack_helper_put(priv->helper6);
}
static void nft_ct_helper_obj_eval(struct nft_object *obj,
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 728baf88295a..4717d7796927 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -65,7 +65,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
return 0;
err1:
- nft_data_uninit(&priv->data, desc.type);
+ nft_data_release(&priv->data, desc.type);
return err;
}
@@ -73,7 +73,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
- return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg));
+
+ return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg));
}
static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 9edc74eedc10..cedb96c3619f 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -102,9 +102,9 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
priv->len = desc_from.len;
return 0;
err2:
- nft_data_uninit(&priv->data_to, desc_to.type);
+ nft_data_release(&priv->data_to, desc_to.type);
err1:
- nft_data_uninit(&priv->data_from, desc_from.type);
+ nft_data_release(&priv->data_from, desc_from.type);
return err;
}
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 8ec086b6b56b..3d3a6df4ce70 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -222,7 +222,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_set_elem elem;
int err;
- err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC);
iter->err = err;
if (err)
return;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f134d384852f..1770c1d9b37f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -283,28 +283,30 @@ static int xt_obj_to_user(u16 __user *psize, u16 size,
&U->u.user.revision, K->u.kernel.TYPE->revision)
int xt_data_to_user(void __user *dst, const void *src,
- int usersize, int size)
+ int usersize, int size, int aligned_size)
{
usersize = usersize ? : size;
if (copy_to_user(dst, src, usersize))
return -EFAULT;
- if (usersize != size && clear_user(dst + usersize, size - usersize))
+ if (usersize != aligned_size &&
+ clear_user(dst + usersize, aligned_size - usersize))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(xt_data_to_user);
-#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
+#define XT_DATA_TO_USER(U, K, TYPE) \
xt_data_to_user(U->data, K->data, \
K->u.kernel.TYPE->usersize, \
- C_SIZE ? : K->u.kernel.TYPE->TYPE##size)
+ K->u.kernel.TYPE->TYPE##size, \
+ XT_ALIGN(K->u.kernel.TYPE->TYPE##size))
int xt_match_to_user(const struct xt_entry_match *m,
struct xt_entry_match __user *u)
{
return XT_OBJ_TO_USER(u, m, match, 0) ||
- XT_DATA_TO_USER(u, m, match, 0);
+ XT_DATA_TO_USER(u, m, match);
}
EXPORT_SYMBOL_GPL(xt_match_to_user);
@@ -312,7 +314,7 @@ int xt_target_to_user(const struct xt_entry_target *t,
struct xt_entry_target __user *u)
{
return XT_OBJ_TO_USER(u, t, target, 0) ||
- XT_DATA_TO_USER(u, t, target, 0);
+ XT_DATA_TO_USER(u, t, target);
}
EXPORT_SYMBOL_GPL(xt_target_to_user);
@@ -611,6 +613,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
}
EXPORT_SYMBOL_GPL(xt_compat_match_from_user);
+#define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \
+ xt_data_to_user(U->data, K->data, \
+ K->u.kernel.TYPE->usersize, \
+ C_SIZE, \
+ COMPAT_XT_ALIGN(C_SIZE))
+
int xt_compat_match_to_user(const struct xt_entry_match *m,
void __user **dstptr, unsigned int *size)
{
@@ -626,7 +634,7 @@ int xt_compat_match_to_user(const struct xt_entry_match *m,
if (match->compat_to_user((void __user *)cm->data, m->data))
return -EFAULT;
} else {
- if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
+ if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm)))
return -EFAULT;
}
@@ -763,17 +771,8 @@ EXPORT_SYMBOL(xt_check_entry_offsets);
*/
unsigned int *xt_alloc_entry_offsets(unsigned int size)
{
- unsigned int *off;
-
- off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN);
+ return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO);
- if (off)
- return off;
-
- if (size < (SIZE_MAX / sizeof(unsigned int)))
- off = vmalloc(size * sizeof(unsigned int));
-
- return off;
}
EXPORT_SYMBOL(xt_alloc_entry_offsets);
@@ -981,7 +980,7 @@ int xt_compat_target_to_user(const struct xt_entry_target *t,
if (target->compat_to_user((void __user *)ct->data, t->data))
return -EFAULT;
} else {
- if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
+ if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct)))
return -EFAULT;
}
@@ -1007,8 +1006,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
if (!info) {
- info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN |
- __GFP_NORETRY | __GFP_HIGHMEM,
+ info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
PAGE_KERNEL);
if (!info)
return NULL;
@@ -1116,7 +1114,7 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
size = sizeof(void **) * nr_cpu_ids;
if (size > PAGE_SIZE)
- i->jumpstack = vzalloc(size);
+ i->jumpstack = kvzalloc(size, GFP_KERNEL);
else
i->jumpstack = kzalloc(size, GFP_KERNEL);
if (i->jumpstack == NULL)
@@ -1138,12 +1136,8 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
*/
size = sizeof(void *) * i->stacksize * 2u;
for_each_possible_cpu(cpu) {
- if (size > PAGE_SIZE)
- i->jumpstack[cpu] = vmalloc_node(size,
- cpu_to_node(cpu));
- else
- i->jumpstack[cpu] = kmalloc_node(size,
- GFP_KERNEL, cpu_to_node(cpu));
+ i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
if (i->jumpstack[cpu] == NULL)
/*
* Freeing will be done later on by the callers. The
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index bb7ad82dcd56..623ef37de886 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
if (help == NULL) {
- module_put(helper->me);
+ nf_conntrack_helper_put(helper);
return -ENOMEM;
}
@@ -263,7 +263,7 @@ out:
err4:
help = nfct_help(ct);
if (help)
- module_put(help->helper->me);
+ nf_conntrack_helper_put(help->helper);
err3:
nf_ct_tmpl_free(ct);
err2:
@@ -346,7 +346,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par,
if (ct) {
help = nfct_help(ct);
if (help)
- module_put(help->helper->me);
+ nf_conntrack_helper_put(help->helper);
nf_ct_netns_put(par->net, par->family);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 37d581a31cff..3f6c4fa78bdb 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -388,10 +388,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
}
sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size;
- if (sz <= PAGE_SIZE)
- t = kzalloc(sz, GFP_KERNEL);
- else
- t = vzalloc(sz);
+ t = kvzalloc(sz, GFP_KERNEL);
if (t == NULL) {
ret = -ENOMEM;
goto out;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index bf602e33c40a..08679ebb3068 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1123,7 +1123,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
if (!help) {
- module_put(helper->me);
+ nf_conntrack_helper_put(helper);
return -ENOMEM;
}
@@ -1584,7 +1584,7 @@ void ovs_ct_free_action(const struct nlattr *a)
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
{
if (ct_info->helper)
- module_put(ct_info->helper->me);
+ nf_conntrack_helper_put(ct_info->helper);
if (ct_info->ct)
nf_ct_tmpl_free(ct_info->ct);
}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f4001763134d..e3eeed19cc7a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2658,13 +2658,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
}
- sockc.tsflags = po->sk.sk_tsflags;
- if (msg->msg_controllen) {
- err = sock_cmsg_send(&po->sk, msg, &sockc);
- if (unlikely(err))
- goto out;
- }
-
err = -ENXIO;
if (unlikely(dev == NULL))
goto out;
@@ -2672,6 +2665,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
+ sockc.tsflags = po->sk.sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(&po->sk, msg, &sockc);
+ if (unlikely(err))
+ goto out_put;
+ }
+
if (po->sk.sk_socket->type == SOCK_RAW)
reserve = dev->hard_header_len;
size_max = po->tx_ring.frame_size
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index dee469fed967..51859b8edd7e 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -203,7 +203,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
*arg = (unsigned long) head;
rcu_assign_pointer(tp->root, new);
- call_rcu(&head->rcu, mall_destroy_rcu);
return 0;
err_replace_hw_filter:
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index bbe57d57b67f..e88342fde1bc 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1831,6 +1831,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
if (!qdisc_dev(root))
return 0;
+ if (tcm->tcm_parent) {
+ q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
+ if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
+ return -1;
+ return 0;
+ }
hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
return -1;
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index d00f4c7c2f3a..b30a2c70bd48 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -376,10 +376,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
if (mask != q->tab_mask) {
struct sk_buff **ntab;
- ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
- GFP_KERNEL | __GFP_NOWARN);
- if (!ntab)
- ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
+ ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO);
if (!ntab)
return -ENOMEM;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index da4f67bda0ee..b488721a0059 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -624,16 +624,6 @@ static void fq_rehash(struct fq_sched_data *q,
q->stat_gc_flows += fcnt;
}
-static void *fq_alloc_node(size_t sz, int node)
-{
- void *ptr;
-
- ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
- if (!ptr)
- ptr = vmalloc_node(sz, node);
- return ptr;
-}
-
static void fq_free(void *addr)
{
kvfree(addr);
@@ -650,7 +640,7 @@ static int fq_resize(struct Qdisc *sch, u32 log)
return 0;
/* If XPS was setup, we can allocate memory on right NUMA node */
- array = fq_alloc_node(sizeof(struct rb_root) << log,
+ array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT,
netdev_queue_numa_node_read(sch->dev_queue));
if (!array)
return -ENOMEM;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 18bbb5476c83..9201abce928c 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -446,27 +446,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
return 0;
}
-static void *fq_codel_zalloc(size_t sz)
-{
- void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
-
- if (!ptr)
- ptr = vzalloc(sz);
- return ptr;
-}
-
-static void fq_codel_free(void *addr)
-{
- kvfree(addr);
-}
-
static void fq_codel_destroy(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
- fq_codel_free(q->backlogs);
- fq_codel_free(q->flows);
+ kvfree(q->backlogs);
+ kvfree(q->flows);
}
static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
@@ -493,13 +479,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
}
if (!q->flows) {
- q->flows = fq_codel_zalloc(q->flows_cnt *
- sizeof(struct fq_codel_flow));
+ q->flows = kvzalloc(q->flows_cnt *
+ sizeof(struct fq_codel_flow), GFP_KERNEL);
if (!q->flows)
return -ENOMEM;
- q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
+ q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL);
if (!q->backlogs) {
- fq_codel_free(q->flows);
+ kvfree(q->flows);
return -ENOMEM;
}
for (i = 0; i < q->flows_cnt; i++) {
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c19d346e6c5a..51d3ba682af9 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -467,29 +467,14 @@ static void hhf_reset(struct Qdisc *sch)
rtnl_kfree_skbs(skb, skb);
}
-static void *hhf_zalloc(size_t sz)
-{
- void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
-
- if (!ptr)
- ptr = vzalloc(sz);
-
- return ptr;
-}
-
-static void hhf_free(void *addr)
-{
- kvfree(addr);
-}
-
static void hhf_destroy(struct Qdisc *sch)
{
int i;
struct hhf_sched_data *q = qdisc_priv(sch);
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
- hhf_free(q->hhf_arrays[i]);
- hhf_free(q->hhf_valid_bits[i]);
+ kvfree(q->hhf_arrays[i]);
+ kvfree(q->hhf_valid_bits[i]);
}
for (i = 0; i < HH_FLOWS_CNT; i++) {
@@ -503,7 +488,7 @@ static void hhf_destroy(struct Qdisc *sch)
kfree(flow);
}
}
- hhf_free(q->hh_flows);
+ kvfree(q->hh_flows);
}
static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
@@ -609,8 +594,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
if (!q->hh_flows) {
/* Initialize heavy-hitter flow table. */
- q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
- sizeof(struct list_head));
+ q->hh_flows = kvzalloc(HH_FLOWS_CNT *
+ sizeof(struct list_head), GFP_KERNEL);
if (!q->hh_flows)
return -ENOMEM;
for (i = 0; i < HH_FLOWS_CNT; i++)
@@ -624,8 +609,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
/* Initialize heavy-hitter filter arrays. */
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
- q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
- sizeof(u32));
+ q->hhf_arrays[i] = kvzalloc(HHF_ARRAYS_LEN *
+ sizeof(u32), GFP_KERNEL);
if (!q->hhf_arrays[i]) {
/* Note: hhf_destroy() will be called
* by our caller.
@@ -637,8 +622,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
/* Initialize valid bits of heavy-hitter filter arrays. */
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
- q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
- BITS_PER_BYTE);
+ q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN /
+ BITS_PER_BYTE, GFP_KERNEL);
if (!q->hhf_valid_bits[i]) {
/* Note: hhf_destroy() will be called
* by our caller.
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index f0ce4780f395..1b3dd6190e93 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -702,15 +702,11 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
spinlock_t *root_lock;
struct disttable *d;
int i;
- size_t s;
if (n > NETEM_DIST_MAX)
return -EINVAL;
- s = sizeof(struct disttable) + n * sizeof(s16);
- d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
- if (!d)
- d = vmalloc(s);
+ d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
if (!d)
return -ENOMEM;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index b00e02c139de..332d94be6e1c 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -685,11 +685,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
static void *sfq_alloc(size_t sz)
{
- void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
-
- if (!ptr)
- ptr = vmalloc(sz);
- return ptr;
+ return kvmalloc(sz, GFP_KERNEL);
}
static void sfq_free(void *addr)
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index a9708da28eb5..95238284c422 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1176,7 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
asoc->ctsn_ack_point = asoc->next_tsn - 1;
asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
- if (!asoc->stream) {
+
+ if (sctp_state(asoc, COOKIE_WAIT)) {
+ sctp_stream_free(asoc->stream);
asoc->stream = new->stream;
new->stream = NULL;
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 961ee59f696a..f5b45b8b8b16 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -240,12 +240,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct sctp_bind_addr *bp;
struct ipv6_pinfo *np = inet6_sk(sk);
struct sctp_sockaddr_entry *laddr;
- union sctp_addr *baddr = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
struct in6_addr *final_p, final;
__u8 matchlen = 0;
- __u8 bmatchlen;
sctp_scope_t scope;
memset(fl6, 0, sizeof(struct flowi6));
@@ -312,23 +310,37 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
*/
rcu_read_lock();
list_for_each_entry_rcu(laddr, &bp->address_list, list) {
- if (!laddr->valid)
+ struct dst_entry *bdst;
+ __u8 bmatchlen;
+
+ if (!laddr->valid ||
+ laddr->state != SCTP_ADDR_SRC ||
+ laddr->a.sa.sa_family != AF_INET6 ||
+ scope > sctp_scope(&laddr->a))
continue;
- if ((laddr->state == SCTP_ADDR_SRC) &&
- (laddr->a.sa.sa_family == AF_INET6) &&
- (scope <= sctp_scope(&laddr->a))) {
- bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
- if (!baddr || (matchlen < bmatchlen)) {
- baddr = &laddr->a;
- matchlen = bmatchlen;
- }
- }
- }
- if (baddr) {
- fl6->saddr = baddr->v6.sin6_addr;
- fl6->fl6_sport = baddr->v6.sin6_port;
+
+ fl6->saddr = laddr->a.v6.sin6_addr;
+ fl6->fl6_sport = laddr->a.v6.sin6_port;
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+
+ if (!IS_ERR(bdst) &&
+ ipv6_chk_addr(dev_net(bdst->dev),
+ &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
+ dst = bdst;
+ break;
+ }
+
+ bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+ if (matchlen > bmatchlen)
+ continue;
+
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
+ dst = bdst;
+ matchlen = bmatchlen;
}
rcu_read_unlock();
@@ -665,6 +677,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
rcu_read_lock();
opt = rcu_dereference(np->opt);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 8a08f13469c4..92e332e17391 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2454,16 +2454,11 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
* stream sequence number shall be set to 0.
*/
- /* Allocate storage for the negotiated streams if it is not a temporary
- * association.
- */
- if (!asoc->temp) {
- if (sctp_stream_init(asoc, gfp))
- goto clean_up;
+ if (sctp_stream_init(asoc, gfp))
+ goto clean_up;
- if (sctp_assoc_set_id(asoc, gfp))
- goto clean_up;
- }
+ if (!asoc->temp && sctp_assoc_set_id(asoc, gfp))
+ goto clean_up;
/* ADDIP Section 4.1 ASCONF Chunk Procedures
*
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 4f5e6cfc7f60..f863b5573e42 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2088,6 +2088,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net,
}
}
+ /* Set temp so that it won't be added into hashtable */
+ new_asoc->temp = 1;
+
/* Compare the tie_tag in cookie with the verification tag of
* current association.
*/
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index c717ef0896aa..33954852f3f8 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -8,6 +8,10 @@ config SMC
The Linux implementation of the SMC-R solution is designed as
a separate socket family SMC.
+ Warning: SMC will expose all memory for remote reads and writes
+ once a connection is established. Don't enable this option except
+ for tightly controlled lab environment.
+
Select this option if you want to run SMC socket applications
config SMC_DIAG
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b6ee21368a6..6793d7348cc8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -101,7 +101,7 @@ struct proto smc_proto = {
.unhash = smc_unhash_sk,
.obj_size = sizeof(struct smc_sock),
.h.smc_hash = &smc_v4_hashinfo,
- .slab_flags = SLAB_DESTROY_BY_RCU,
+ .slab_flags = SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index e41f594a1e1d..03ec058d18df 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -204,7 +204,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
hton24(cclc.qpn, link->roce_qp->qp_num);
cclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]);
cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
cclc.rmbe_alert_token = htonl(conn->alert_token_local);
cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
@@ -256,7 +256,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
hton24(aclc.qpn, link->roce_qp->qp_num);
aclc.rmb_rkey =
- htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]);
aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
aclc.rmbe_alert_token = htonl(conn->alert_token_local);
aclc.qp_mtu = link->path_mtu;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 65020e93ff21..3ac09a629ea1 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -613,19 +613,8 @@ int smc_rmb_create(struct smc_sock *smc)
rmb_desc = NULL;
continue; /* if mapping failed, try smaller one */
}
- rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE,
- &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
- if (rc) {
- smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
- tmp_bufsize, rmb_desc,
- DMA_FROM_DEVICE);
- kfree(rmb_desc->cpu_addr);
- kfree(rmb_desc);
- rmb_desc = NULL;
- continue;
- }
+ rmb_desc->rkey[SMC_SINGLE_LINK] =
+ lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey;
rmb_desc->used = 1;
write_lock_bh(&lgr->rmbs_lock);
list_add(&rmb_desc->list,
@@ -668,6 +657,7 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn,
for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+ (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
test_bit(i, lgr->rtokens_used_mask)) {
conn->rtoken_idx = i;
return 0;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 27eb38056a27..b013cb43a327 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -93,7 +93,7 @@ struct smc_buf_desc {
u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
/* mapped address of buffer */
void *cpu_addr; /* virtual address of buffer */
- struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ u32 rkey[SMC_LINKS_PER_LGR_MAX];
/* for rmb only:
* rkey provided to peer
*/
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index cb69ab977cd7..b31715505a35 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -37,24 +37,6 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
* identifier
*/
-int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
- struct ib_mr **mr)
-{
- int rc;
-
- if (*mr)
- return 0; /* already done */
-
- /* obtain unique key -
- * next invocation of get_dma_mr returns a different key!
- */
- *mr = pd->device->get_dma_mr(pd, access_flags);
- rc = PTR_ERR_OR_ZERO(*mr);
- if (IS_ERR(*mr))
- *mr = NULL;
- return rc;
-}
-
static int smc_ib_modify_qp_init(struct smc_link *lnk)
{
struct ib_qp_attr qp_attr;
@@ -210,7 +192,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk)
{
int rc;
- lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev,
+ IB_PD_UNSAFE_GLOBAL_RKEY);
rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
if (IS_ERR(lnk->roce_pd))
lnk->roce_pd = NULL;
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 7e1f0e24d177..b567152a526d 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -61,8 +61,6 @@ void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
int smc_ib_create_protection_domain(struct smc_link *lnk);
void smc_ib_destroy_queue_pair(struct smc_link *lnk);
int smc_ib_create_queue_pair(struct smc_link *lnk);
-int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
- struct ib_mr **mr);
int smc_ib_ready_link(struct smc_link *lnk);
int smc_ib_modify_qp_rts(struct smc_link *lnk);
int smc_ib_modify_qp_reset(struct smc_link *lnk);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 04ce2c0b660e..ac09ca803296 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA
tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
default SUNRPC && INFINIBAND
+ select SG_POOL
help
This option allows the NFS client and server to use RDMA
transports (InfiniBand, iWARP, or RoCE).
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 52da3ce54bb5..b5cb921775a0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1042,8 +1042,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
struct rpc_task *task;
task = rpc_new_task(task_setup_data);
- if (IS_ERR(task))
- goto out;
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
@@ -1053,7 +1051,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
atomic_inc(&task->tk_count);
rpc_execute(task);
-out:
return task;
}
EXPORT_SYMBOL_GPL(rpc_run_task);
@@ -1140,10 +1137,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
* Create an rpc_task to send the data
*/
task = rpc_new_task(&task_setup_data);
- if (IS_ERR(task)) {
- xprt_free_bc_request(req);
- goto out;
- }
task->tk_rqstp = req;
/*
@@ -1158,7 +1151,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
WARN_ON_ONCE(atomic_read(&task->tk_count) != 2);
rpc_execute(task);
-out:
dprintk("RPC: rpc_run_bc_task: task= %p\n", task);
return task;
}
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 5db68b371db2..0cc83839c13c 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -965,11 +965,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data)
if (task == NULL) {
task = rpc_alloc_task();
- if (task == NULL) {
- rpc_release_calldata(setup_data->callback_ops,
- setup_data->callback_data);
- return ERR_PTR(-ENOMEM);
- }
flags = RPC_TASK_DYNAMIC;
}
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a08aeb56b8e4..bc0f5a0ecbdc 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -702,59 +702,32 @@ found_pool:
return task;
}
-/*
- * Create or destroy enough new threads to make the number
- * of threads the given number. If `pool' is non-NULL, applies
- * only to threads in that pool, otherwise round-robins between
- * all pools. Caller must ensure that mutual exclusion between this and
- * server startup or shutdown.
- *
- * Destroying threads relies on the service threads filling in
- * rqstp->rq_task, which only the nfs ones do. Assumes the serv
- * has been created using svc_create_pooled().
- *
- * Based on code that used to be in nfsd_svc() but tweaked
- * to be pool-aware.
- */
-int
-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+/* create new threads */
+static int
+svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct svc_rqst *rqstp;
struct task_struct *task;
struct svc_pool *chosen_pool;
- int error = 0;
unsigned int state = serv->sv_nrthreads-1;
int node;
- if (pool == NULL) {
- /* The -1 assumes caller has done a svc_get() */
- nrservs -= (serv->sv_nrthreads-1);
- } else {
- spin_lock_bh(&pool->sp_lock);
- nrservs -= pool->sp_nrthreads;
- spin_unlock_bh(&pool->sp_lock);
- }
-
- /* create new threads */
- while (nrservs > 0) {
+ do {
nrservs--;
chosen_pool = choose_pool(serv, pool, &state);
node = svc_pool_map_get_node(chosen_pool->sp_id);
rqstp = svc_prepare_thread(serv, chosen_pool, node);
- if (IS_ERR(rqstp)) {
- error = PTR_ERR(rqstp);
- break;
- }
+ if (IS_ERR(rqstp))
+ return PTR_ERR(rqstp);
__module_get(serv->sv_ops->svo_module);
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
- error = PTR_ERR(task);
module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
- break;
+ return PTR_ERR(task);
}
rqstp->rq_task = task;
@@ -763,18 +736,103 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
svc_sock_update_bufs(serv);
wake_up_process(task);
- }
+ } while (nrservs > 0);
+
+ return 0;
+}
+
+
+/* destroy old threads */
+static int
+svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct task_struct *task;
+ unsigned int state = serv->sv_nrthreads-1;
+
/* destroy old threads */
- while (nrservs < 0 &&
- (task = choose_victim(serv, pool, &state)) != NULL) {
+ do {
+ task = choose_victim(serv, pool, &state);
+ if (task == NULL)
+ break;
send_sig(SIGINT, task, 1);
nrservs++;
+ } while (nrservs < 0);
+
+ return 0;
+}
+
+/*
+ * Create or destroy enough new threads to make the number
+ * of threads the given number. If `pool' is non-NULL, applies
+ * only to threads in that pool, otherwise round-robins between
+ * all pools. Caller must ensure that mutual exclusion between this and
+ * server startup or shutdown.
+ *
+ * Destroying threads relies on the service threads filling in
+ * rqstp->rq_task, which only the nfs ones do. Assumes the serv
+ * has been created using svc_create_pooled().
+ *
+ * Based on code that used to be in nfsd_svc() but tweaked
+ * to be pool-aware.
+ */
+int
+svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
}
- return error;
+ if (nrservs > 0)
+ return svc_start_kthreads(serv, pool, nrservs);
+ if (nrservs < 0)
+ return svc_signal_kthreads(serv, pool, nrservs);
+ return 0;
}
EXPORT_SYMBOL_GPL(svc_set_num_threads);
+/* destroy old threads */
+static int
+svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ struct task_struct *task;
+ unsigned int state = serv->sv_nrthreads-1;
+
+ /* destroy old threads */
+ do {
+ task = choose_victim(serv, pool, &state);
+ if (task == NULL)
+ break;
+ kthread_stop(task);
+ nrservs++;
+ } while (nrservs < 0);
+ return 0;
+}
+
+int
+svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+{
+ if (pool == NULL) {
+ /* The -1 assumes caller has done a svc_get() */
+ nrservs -= (serv->sv_nrthreads-1);
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+ spin_unlock_bh(&pool->sp_lock);
+ }
+
+ if (nrservs > 0)
+ return svc_start_kthreads(serv, pool, nrservs);
+ if (nrservs < 0)
+ return svc_stop_kthreads(serv, pool, nrservs);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
+
/*
* Called from a server thread as it's exiting. Caller must hold the "service
* mutex" for the service.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 1f7082144e01..e34f4ee7f2b6 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -807,7 +807,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
EXPORT_SYMBOL_GPL(xdr_init_decode);
/**
- * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages
* @xdr: pointer to xdr_stream struct
* @buf: pointer to XDR buffer from which to decode data
* @pages: list of pages to decode into
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index b530a2852ba8..3e63c5e97ebe 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -651,6 +651,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
+EXPORT_SYMBOL_GPL(xprt_force_disconnect);
/**
* xprt_conditional_disconnect - force a transport to disconnect
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index ef19fa42c50f..c1ae8142ab73 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
- module.o
+ svc_rdma_rw.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a044be2d6ad7..694e9b13ecf0 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -494,7 +494,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
}
sge->length = len;
- ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr,
sge->length, DMA_TO_DEVICE);
req->rl_send_wr.num_sge++;
return true;
@@ -523,7 +523,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
sge[sge_no].addr = rdmab_addr(rb);
sge[sge_no].length = xdr->head[0].iov_len;
sge[sge_no].lkey = rdmab_lkey(rb);
- ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+ ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr,
sge[sge_no].length, DMA_TO_DEVICE);
/* If there is a Read chunk, the page list is being handled
@@ -781,9 +781,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
return 0;
out_err:
- pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
- PTR_ERR(iptr));
- r_xprt->rx_stats.failed_marshal_count++;
+ if (PTR_ERR(iptr) != -ENOBUFS) {
+ pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
+ PTR_ERR(iptr));
+ r_xprt->rx_stats.failed_marshal_count++;
+ }
return PTR_ERR(iptr);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index c846ca9f1eba..a4a8f6989ee7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
-unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
-static unsigned int min_max_inline = 4096;
-static unsigned int max_max_inline = 65536;
+unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
atomic_t rdma_stat_recv;
atomic_t rdma_stat_read;
@@ -247,8 +247,6 @@ int svc_rdma_init(void)
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %u\n",
- svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index ff1df40f0d26..c676ed0efb5a 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -12,7 +12,17 @@
#undef SVCRDMA_BACKCHANNEL_DEBUG
-int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+/**
+ * svc_rdma_handle_bc_reply - Process incoming backchannel reply
+ * @xprt: controlling backchannel transport
+ * @rdma_resp: pointer to incoming transport header
+ * @rcvbuf: XDR buffer into which to decode the reply
+ *
+ * Returns:
+ * %0 if @rcvbuf is filled in, xprt_complete_rqst called,
+ * %-EAGAIN if server should call ->recvfrom again.
+ */
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
struct xdr_buf *rcvbuf)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
p = (__be32 *)src->iov_base;
len = src->iov_len;
- xid = rmsgp->rm_xid;
+ xid = *rdma_resp;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: xid=%08x, length=%zu\n",
__func__, be32_to_cpu(xid), len);
pr_info("%s: RPC/RDMA: %*ph\n",
- __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
pr_info("%s: RPC: %*ph\n",
__func__, (int)len, p);
#endif
@@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
goto out_unlock;
memcpy(dst->iov_base, p, len);
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(rdma_resp + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
@@ -90,9 +100,9 @@ out_notfound:
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
*
- * This is similar to svc_rdma_reply, but takes an rpc_rqst
- * instead, does not support chunks, and avoids blocking memory
- * allocation.
+ * This is similar to svc_rdma_send_reply_msg, but takes a struct
+ * rpc_rqst instead, does not support chunks, and avoids blocking
+ * memory allocation.
*
* XXX: There is still an opportunity to block in svc_rdma_send()
* if there are no SQ entries to post the Send. This may occur if
@@ -101,59 +111,36 @@ out_notfound:
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst)
{
- struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
struct svc_rdma_op_ctxt *ctxt;
- struct svc_rdma_req_map *vec;
- struct ib_send_wr send_wr;
int ret;
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
- if (ret)
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* rpcrdma_bc_send_request builds the transport header and
+ * the backchannel RPC message in the same buffer. Thus only
+ * one SGE is needed to send both.
+ */
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
+ rqst->rq_snd_buf.len);
+ if (ret < 0)
goto out_err;
ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
if (ret)
goto out_err;
- ctxt = svc_rdma_get_context(rdma);
- ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
- ctxt->count = 1;
-
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = sndbuf->len;
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
- sndbuf->len, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
- ret = -EIO;
- goto out_unmap;
- }
- svc_rdma_count_mappings(rdma, ctxt);
-
- memset(&send_wr, 0, sizeof(send_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = 1;
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
-
- ret = svc_rdma_send(rdma, &send_wr);
- if (ret) {
- ret = -EIO;
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
+ if (ret)
goto out_unmap;
- }
out_err:
- svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: %s returns %d\n", __func__, ret);
return ret;
out_unmap:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
+ ret = -EIO;
goto out_err;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 1c4aabf0f657..bdcf7d85a3dc 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -166,92 +166,3 @@ out_inval:
dprintk("svcrdma: failed to parse transport header\n");
return -EINVAL;
}
-
-int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err, __be32 *va)
-{
- __be32 *startp = va;
-
- *va++ = rmsgp->rm_xid;
- *va++ = rmsgp->rm_vers;
- *va++ = xprt->sc_fc_credits;
- *va++ = rdma_error;
- *va++ = cpu_to_be32(err);
- if (err == ERR_VERS) {
- *va++ = rpcrdma_version;
- *va++ = rpcrdma_version;
- }
-
- return (int)((unsigned long)va - (unsigned long)startp);
-}
-
-/**
- * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
- * @rdma_resp: buffer containing Reply transport header
- *
- * Returns length of transport header, in bytes.
- */
-unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
-{
- unsigned int nsegs;
- __be32 *p;
-
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- return (unsigned long)p - (unsigned long)rdma_resp;
-}
-
-void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
-{
- struct rpcrdma_write_array *ary;
-
- /* no read-list */
- rmsgp->rm_body.rm_chunks[0] = xdr_zero;
-
- /* write-array discrim */
- ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-
- /* write-list terminator */
- ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
-
- /* reply-array discriminator */
- ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
-}
-
-void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
- int chunks)
-{
- ary->wc_discrim = xdr_one;
- ary->wc_nchunks = cpu_to_be32(chunks);
-}
-
-void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
- int chunk_no,
- __be32 rs_handle,
- __be64 rs_offset,
- u32 write_len)
-{
- struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
- seg->rs_handle = rs_handle;
- seg->rs_offset = rs_offset;
- seg->rs_length = cpu_to_be32(write_len);
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f7b2daf72a86..27a99bf5b1a6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.buflen = head->arg.buflen;
}
+static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
+ __be32 *rdma_argp, int status)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p, *err_msgp;
+ unsigned int length;
+ struct page *page;
+ int ret;
+
+ ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
+ if (ret)
+ return;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return;
+ err_msgp = page_address(page);
+
+ p = err_msgp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = xprt->sc_fc_credits;
+ *p++ = rdma_error;
+ if (status == -EPROTONOSUPPORT) {
+ *p++ = err_vers;
+ *p++ = rpcrdma_version;
+ *p++ = rpcrdma_version;
+ } else {
+ *p++ = err_chunk;
+ }
+ length = (unsigned long)p - (unsigned long)err_msgp;
+
+ /* Map transport header; no RPC message payload */
+ ctxt = svc_rdma_get_context(xprt);
+ ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
+ if (ret) {
+ dprintk("svcrdma: Error %d mapping send for protocol error\n",
+ ret);
+ return;
+ }
+
+ ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
+ if (ret) {
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ }
+}
+
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
-static bool
-svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
+ __be32 *rdma_resp)
{
- __be32 *p = (__be32 *)rmsgp;
+ __be32 *p;
if (!xprt->xpt_bc_xprt)
return false;
- if (rmsgp->rm_type != rdma_msg)
+ p = rdma_resp + 3;
+ if (*p++ != rdma_msg)
return false;
- if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != rmsgp->rm_xid)
+ /* XID sanity */
+ if (*p++ != *rdma_resp)
return false;
/* call direction */
- if (p[8] == cpu_to_be32(RPC_CALL))
+ if (*p == cpu_to_be32(RPC_CALL))
return false;
return true;
@@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
- if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
- ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
+ &rmsgp->rm_xid,
&rqstp->rq_arg);
svc_rdma_put_context(ctxt, 0);
if (ret)
@@ -686,7 +739,7 @@ complete:
return ret;
out_err:
- svc_rdma_send_error(rdma_xprt, rmsgp, ret);
+ svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
svc_rdma_put_context(ctxt, 0);
return 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
new file mode 100644
index 000000000000..0cf620277693
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
+ */
+
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/sunrpc/debug.h>
+
+#include <rdma/rw.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/* Each R/W context contains state for one chain of RDMA Read or
+ * Write Work Requests.
+ *
+ * Each WR chain handles a single contiguous server-side buffer,
+ * because scatterlist entries after the first have to start on
+ * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ *
+ * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
+ * from a client may contain a unique R_key, so each WR chain moves
+ * up to one segment at a time.
+ *
+ * The scatterlist makes this data structure over 4KB in size. To
+ * make it less likely to fail, and to handle the allocation for
+ * smaller I/O requests without disabling bottom-halves, these
+ * contexts are created on demand, but cached and reused until the
+ * controlling svcxprt_rdma is destroyed.
+ */
+struct svc_rdma_rw_ctxt {
+ struct list_head rw_list;
+ struct rdma_rw_ctx rw_ctx;
+ int rw_nents;
+ struct sg_table rw_sg_table;
+ struct scatterlist rw_first_sgl[0];
+};
+
+static inline struct svc_rdma_rw_ctxt *
+svc_rdma_next_ctxt(struct list_head *list)
+{
+ return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
+ rw_list);
+}
+
+static struct svc_rdma_rw_ctxt *
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+
+ ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
+ if (ctxt) {
+ list_del(&ctxt->rw_list);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ } else {
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+ ctxt = kmalloc(sizeof(*ctxt) +
+ SG_CHUNK_SIZE * sizeof(struct scatterlist),
+ GFP_KERNEL);
+ if (!ctxt)
+ goto out;
+ INIT_LIST_HEAD(&ctxt->rw_list);
+ }
+
+ ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
+ if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
+ ctxt->rw_sg_table.sgl)) {
+ kfree(ctxt);
+ ctxt = NULL;
+ }
+out:
+ return ctxt;
+}
+
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ sg_free_table_chained(&ctxt->rw_sg_table, true);
+
+ spin_lock(&rdma->sc_rw_ctxt_lock);
+ list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
+ spin_unlock(&rdma->sc_rw_ctxt_lock);
+}
+
+/**
+ * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
+ * @rdma: transport about to be destroyed
+ *
+ */
+void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
+{
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+ kfree(ctxt);
+ }
+}
+
+/* A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ *
+ * These are small, acquired with a single allocator call, and
+ * no more than one is needed per chunk. They are allocated on
+ * demand, and not cached.
+ */
+struct svc_rdma_chunk_ctxt {
+ struct ib_cqe cc_cqe;
+ struct svcxprt_rdma *cc_rdma;
+ struct list_head cc_rwctxts;
+ int cc_sqecount;
+ enum dma_data_direction cc_dir;
+};
+
+static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+ struct svc_rdma_chunk_ctxt *cc,
+ enum dma_data_direction dir)
+{
+ cc->cc_rdma = rdma;
+ svc_xprt_get(&rdma->sc_xprt);
+
+ INIT_LIST_HEAD(&cc->cc_rwctxts);
+ cc->cc_sqecount = 0;
+ cc->cc_dir = dir;
+}
+
+static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
+ list_del(&ctxt->rw_list);
+
+ rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, cc->cc_dir);
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ }
+ svc_xprt_put(&rdma->sc_xprt);
+}
+
+/* State for sending a Write or Reply chunk.
+ * - Tracks progress of writing one chunk over all its segments
+ * - Stores arguments for the SGL constructor functions
+ */
+struct svc_rdma_write_info {
+ /* write state of this chunk */
+ unsigned int wi_seg_off;
+ unsigned int wi_seg_no;
+ unsigned int wi_nsegs;
+ __be32 *wi_segs;
+
+ /* SGL constructor arguments */
+ struct xdr_buf *wi_xdr;
+ unsigned char *wi_base;
+ unsigned int wi_next_off;
+
+ struct svc_rdma_chunk_ctxt wi_cc;
+};
+
+static struct svc_rdma_write_info *
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+{
+ struct svc_rdma_write_info *info;
+
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return info;
+
+ info->wi_seg_off = 0;
+ info->wi_seg_no = 0;
+ info->wi_nsegs = be32_to_cpup(++chunk);
+ info->wi_segs = ++chunk;
+ svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
+ return info;
+}
+
+static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
+{
+ svc_rdma_cc_release(&info->wi_cc);
+ kfree(info);
+}
+
+/**
+ * svc_rdma_write_done - Write chunk completion
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion
+ *
+ * Pages under I/O are freed by a subsequent Send completion.
+ */
+static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_chunk_ctxt *cc =
+ container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_write_info *info =
+ container_of(cc, struct svc_rdma_write_info, wi_cc);
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ }
+
+ svc_rdma_write_info_free(info);
+}
+
+/* This function sleeps when the transport's Send Queue is congested.
+ *
+ * Assumptions:
+ * - If ib_post_send() succeeds, only one completion is expected,
+ * even if one or more WRs are flushed. This is true when posting
+ * an rdma_rw_ctx or when posting a single signaled WR.
+ */
+static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
+{
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+ struct ib_send_wr *first_wr, *bad_wr;
+ struct list_head *tmp;
+ struct ib_cqe *cqe;
+ int ret;
+
+ first_wr = NULL;
+ cqe = &cc->cc_cqe;
+ list_for_each(tmp, &cc->cc_rwctxts) {
+ struct svc_rdma_rw_ctxt *ctxt;
+
+ ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
+ first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, cqe, first_wr);
+ cqe = NULL;
+ }
+
+ do {
+ if (atomic_sub_return(cc->cc_sqecount,
+ &rdma->sc_sq_avail) > 0) {
+ ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+ if (ret)
+ break;
+ return 0;
+ }
+
+ atomic_inc(&rdma_stat_sq_starve);
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wait_event(rdma->sc_send_wait,
+ atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
+ } while (1);
+
+ pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+
+ /* If even one was posted, there will be a completion. */
+ if (bad_wr != first_wr)
+ return 0;
+
+ atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+ wake_up(&rdma->sc_send_wait);
+ return -ENOTCONN;
+}
+
+/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+ */
+static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ struct scatterlist *sg = ctxt->rw_sg_table.sgl;
+
+ sg_set_buf(&sg[0], info->wi_base, len);
+ info->wi_base += len;
+
+ ctxt->rw_nents = 1;
+}
+
+/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+ */
+static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
+ unsigned int remaining,
+ struct svc_rdma_rw_ctxt *ctxt)
+{
+ unsigned int sge_no, sge_bytes, page_off, page_no;
+ struct xdr_buf *xdr = info->wi_xdr;
+ struct scatterlist *sg;
+ struct page **page;
+
+ page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
+ page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
+ page = xdr->pages + page_no;
+ info->wi_next_off += remaining;
+ sg = ctxt->rw_sg_table.sgl;
+ sge_no = 0;
+ do {
+ sge_bytes = min_t(unsigned int, remaining,
+ PAGE_SIZE - page_off);
+ sg_set_page(sg, *page, sge_bytes, page_off);
+
+ remaining -= sge_bytes;
+ sg = sg_next(sg);
+ page_off = 0;
+ sge_no++;
+ page++;
+ } while (remaining);
+
+ ctxt->rw_nents = sge_no;
+}
+
+/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
+ * an RPC Reply.
+ */
+static int
+svc_rdma_build_writes(struct svc_rdma_write_info *info,
+ void (*constructor)(struct svc_rdma_write_info *info,
+ unsigned int len,
+ struct svc_rdma_rw_ctxt *ctxt),
+ unsigned int remaining)
+{
+ struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+ struct svcxprt_rdma *rdma = cc->cc_rdma;
+ struct svc_rdma_rw_ctxt *ctxt;
+ __be32 *seg;
+ int ret;
+
+ cc->cc_cqe.done = svc_rdma_write_done;
+ seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
+ do {
+ unsigned int write_len;
+ u32 seg_length, seg_handle;
+ u64 seg_offset;
+
+ if (info->wi_seg_no >= info->wi_nsegs)
+ goto out_overflow;
+
+ seg_handle = be32_to_cpup(seg);
+ seg_length = be32_to_cpup(seg + 1);
+ xdr_decode_hyper(seg + 2, &seg_offset);
+ seg_offset += info->wi_seg_off;
+
+ write_len = min(remaining, seg_length - info->wi_seg_off);
+ ctxt = svc_rdma_get_rw_ctxt(rdma,
+ (write_len >> PAGE_SHIFT) + 2);
+ if (!ctxt)
+ goto out_noctx;
+
+ constructor(info, write_len, ctxt);
+ ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
+ rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+ ctxt->rw_nents, 0, seg_offset,
+ seg_handle, DMA_TO_DEVICE);
+ if (ret < 0)
+ goto out_initerr;
+
+ list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+ cc->cc_sqecount += ret;
+ if (write_len == seg_length - info->wi_seg_off) {
+ seg += 4;
+ info->wi_seg_no++;
+ info->wi_seg_off = 0;
+ } else {
+ info->wi_seg_off += write_len;
+ }
+ remaining -= write_len;
+ } while (remaining);
+
+ return 0;
+
+out_overflow:
+ dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
+ info->wi_nsegs);
+ return -E2BIG;
+
+out_noctx:
+ dprintk("svcrdma: no R/W ctxs available\n");
+ return -ENOMEM;
+
+out_initerr:
+ svc_rdma_put_rw_ctxt(rdma, ctxt);
+ pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
+ return -EIO;
+}
+
+/* Send one of an xdr_buf's kvecs by itself. To send a Reply
+ * chunk, the whole RPC Reply is written back to the client.
+ * This function writes either the head or tail of the xdr_buf
+ * containing the Reply.
+ */
+static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
+ struct kvec *vec)
+{
+ info->wi_base = vec->iov_base;
+ return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+ vec->iov_len);
+}
+
+/* Send an xdr_buf's page list by itself. A Write chunk is
+ * just the page list. a Reply chunk is the head, page list,
+ * and tail. This function is shared between the two types
+ * of chunk.
+ */
+static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
+ struct xdr_buf *xdr)
+{
+ info->wi_xdr = xdr;
+ info->wi_next_off = 0;
+ return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+ xdr->page_len);
+}
+
+/**
+ * svc_rdma_send_write_chunk - Write all segments in a Write chunk
+ * @rdma: controlling RDMA transport
+ * @wr_ch: Write chunk provided by client
+ * @xdr: xdr_buf containing the data payload
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Write chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
+ struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int ret;
+
+ if (!xdr->page_len)
+ return 0;
+
+ info = svc_rdma_write_info_alloc(rdma, wr_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return xdr->page_len;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
+
+/**
+ * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @rp_ch: Reply chunk provided by client
+ * @writelist: true if client provided a Write list
+ * @xdr: xdr_buf containing an RPC Reply
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ * %-E2BIG if the payload was larger than the Reply chunk,
+ * %-ENOMEM if rdma_rw context pool was exhausted,
+ * %-ENOTCONN if posting failed (connection is lost),
+ * %-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
+ bool writelist, struct xdr_buf *xdr)
+{
+ struct svc_rdma_write_info *info;
+ int consumed, ret;
+
+ info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ if (!info)
+ return -ENOMEM;
+
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed = xdr->head[0].iov_len;
+
+ /* Send the page list in the Reply chunk only if the
+ * client did not provide Write chunks.
+ */
+ if (!writelist && xdr->page_len) {
+ ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->page_len;
+ }
+
+ if (xdr->tail[0].iov_len) {
+ ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
+ if (ret < 0)
+ goto out_err;
+ consumed += xdr->tail[0].iov_len;
+ }
+
+ ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+ if (ret < 0)
+ goto out_err;
+ return consumed;
+
+out_err:
+ svc_rdma_write_info_free(info);
+ return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 515221b16d09..1736337f3a55 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
@@ -40,6 +41,63 @@
* Author: Tom Tucker <tom@opengridcomputing.com>
*/
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
+ * The ownership of all of the Reply's pages are transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_op_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ * successfully, or get flushed. Either way, the Send completion
+ * handler releases the Reply's pages.
+ * - If the Send WR cannot be not posted, the forward path releases
+ * the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */
+
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
return (len & 3) ? (4 - (len & 3)) : 0;
}
-int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec,
- bool write_chunk_present)
+/* Returns length of transport header, in bytes.
+ */
+static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
{
- int sge_no;
- u32 sge_bytes;
- u32 page_bytes;
- u32 page_off;
- int page_no;
-
- if (xdr->len !=
- (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
- return -EIO;
- }
+ unsigned int nsegs;
+ __be32 *p;
- /* Skip the first sge, this is for the RPCRDMA header */
- sge_no = 1;
+ p = rdma_resp;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;
- /* Head SGE */
- vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
- vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
- sge_no++;
-
- /* pages SGE */
- page_no = 0;
- page_bytes = xdr->page_len;
- page_off = xdr->page_base;
- while (page_bytes) {
- vec->sge[sge_no].iov_base =
- page_address(xdr->pages[page_no]) + page_off;
- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
- page_bytes -= sge_bytes;
- vec->sge[sge_no].iov_len = sge_bytes;
-
- sge_no++;
- page_no++;
- page_off = 0; /* reset for next time through loop */
+ /* Skip Write list. */
+ while (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
}
- /* Tail SGE */
- if (xdr->tail[0].iov_len) {
- unsigned char *base = xdr->tail[0].iov_base;
- size_t len = xdr->tail[0].iov_len;
- u32 xdr_pad = xdr_padsize(xdr->page_len);
+ /* Skip Reply chunk. */
+ if (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
- if (write_chunk_present && xdr_pad) {
- base += xdr_pad;
- len -= xdr_pad;
- }
+ return (unsigned long)p - (unsigned long)rdma_resp;
+}
- if (len) {
- vec->sge[sge_no].iov_base = base;
- vec->sge[sge_no].iov_len = len;
- sge_no++;
+/* One Write chunk is copied from Call transport header to Reply
+ * transport header. Each segment's length field is updated to
+ * reflect number of bytes consumed in the segment.
+ *
+ * Returns number of segments in this chunk.
+ */
+static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
+ unsigned int remaining)
+{
+ unsigned int i, nsegs;
+ u32 seg_len;
+
+ /* Write list discriminator */
+ *dst++ = *src++;
+
+ /* number of segments in this chunk */
+ nsegs = be32_to_cpup(src);
+ *dst++ = *src++;
+
+ for (i = nsegs; i; i--) {
+ /* segment's RDMA handle */
+ *dst++ = *src++;
+
+ /* bytes returned in this segment */
+ seg_len = be32_to_cpu(*src);
+ if (remaining >= seg_len) {
+ /* entire segment was consumed */
+ *dst = *src;
+ remaining -= seg_len;
+ } else {
+ /* segment only partly filled */
+ *dst = cpu_to_be32(remaining);
+ remaining = 0;
}
- }
+ dst++; src++;
- dprintk("svcrdma: %s: sge_no %d page_no %d "
- "page_base %u page_len %u head_len %zu tail_len %zu\n",
- __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
- xdr->head[0].iov_len, xdr->tail[0].iov_len);
+ /* segment's RDMA offset */
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
- vec->count = sge_no;
- return 0;
+ return nsegs;
}
-static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- u32 xdr_off, size_t len, int dir)
+/* The client provided a Write list in the Call message. Fill in
+ * the segments in the first Write chunk in the Reply's transport
+ * header with the number of bytes consumed in each segment.
+ * Remaining chunks are returned unused.
+ *
+ * Assumptions:
+ * - Client has provided only one Write chunk
+ */
+static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
+ unsigned int consumed)
{
- struct page *page;
- dma_addr_t dma_addr;
- if (xdr_off < xdr->head[0].iov_len) {
- /* This offset is in the head */
- xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
- page = virt_to_page(xdr->head[0].iov_base);
- } else {
- xdr_off -= xdr->head[0].iov_len;
- if (xdr_off < xdr->page_len) {
- /* This offset is in the page list */
- xdr_off += xdr->page_base;
- page = xdr->pages[xdr_off >> PAGE_SHIFT];
- xdr_off &= ~PAGE_MASK;
- } else {
- /* This offset is in the tail */
- xdr_off -= xdr->page_len;
- xdr_off += (unsigned long)
- xdr->tail[0].iov_base & ~PAGE_MASK;
- page = virt_to_page(xdr->tail[0].iov_base);
- }
+ unsigned int nsegs;
+ __be32 *p, *q;
+
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+
+ q = wr_ch;
+ while (*q != xdr_zero) {
+ nsegs = xdr_encode_write_chunk(p, q, consumed);
+ q += 2 + nsegs * rpcrdma_segment_maxsz;
+ p += 2 + nsegs * rpcrdma_segment_maxsz;
+ consumed = 0;
}
- dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
- min_t(size_t, PAGE_SIZE, len), dir);
- return dma_addr;
+
+ /* Terminate Write list */
+ *p++ = xdr_zero;
+
+ /* Reply chunk discriminator; may be replaced later */
+ *p = xdr_zero;
+}
+
+/* The client provided a Reply chunk in the Call message. Fill in
+ * the segments in the Reply chunk in the Reply message with the
+ * number of bytes consumed in each segment.
+ *
+ * Assumptions:
+ * - Reply can always fit in the provided Reply chunk
+ */
+static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
+ unsigned int consumed)
+{
+ __be32 *p;
+
+ /* Find the Reply chunk in the Reply's xprt header.
+ * RPC-over-RDMA V1 replies never have a Read list.
+ */
+ p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+
+ /* Skip past Write list */
+ while (*p++ != xdr_zero)
+ p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+
+ xdr_encode_write_chunk(p, rp_ch, consumed);
}
/* Parse the RPC Call's transport header.
*/
-static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
- struct rpcrdma_write_array **write,
- struct rpcrdma_write_array **reply)
+static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
+ __be32 **write, __be32 **reply)
{
__be32 *p;
- p = (__be32 *)&rmsgp->rm_body.rm_chunks[0];
+ p = rdma_argp + rpcrdma_fixed_maxsz;
/* Read list */
while (*p++ != xdr_zero)
@@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
/* Write list */
if (*p != xdr_zero) {
- *write = (struct rpcrdma_write_array *)p;
+ *write = p;
while (*p++ != xdr_zero)
p += 1 + be32_to_cpu(*p) * 4;
} else {
@@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
/* Reply chunk */
if (*p != xdr_zero)
- *reply = (struct rpcrdma_write_array *)p;
+ *reply = p;
else
*reply = NULL;
}
@@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp,
* Invalidate, and responder chooses one rkey to invalidate.
*
* Find a candidate rkey to invalidate when sending a reply. Picks the
- * first rkey it finds in the chunks lists.
+ * first R_key it finds in the chunk lists.
*
* Returns zero if RPC's chunk lists are empty.
*/
-static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
- struct rpcrdma_write_array *wr_ary,
- struct rpcrdma_write_array *rp_ary)
+static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp,
+ __be32 *wr_lst, __be32 *rp_ch)
{
- struct rpcrdma_read_chunk *rd_ary;
- struct rpcrdma_segment *arg_ch;
+ __be32 *p;
- rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0];
- if (rd_ary->rc_discrim != xdr_zero)
- return be32_to_cpu(rd_ary->rc_target.rs_handle);
+ p = rdma_argp + rpcrdma_fixed_maxsz;
+ if (*p != xdr_zero)
+ p += 2;
+ else if (wr_lst && be32_to_cpup(wr_lst + 1))
+ p = wr_lst + 2;
+ else if (rp_ch && be32_to_cpup(rp_ch + 1))
+ p = rp_ch + 2;
+ else
+ return 0;
+ return be32_to_cpup(p);
+}
- if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
- arg_ch = &wr_ary->wc_array[0].wc_target;
- return be32_to_cpu(arg_ch->rs_handle);
- }
+/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+ * is used during completion to DMA-unmap this memory, and
+ * it uses ib_dma_unmap_page() exclusively.
+ */
+static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ unsigned int sge_no,
+ unsigned char *base,
+ unsigned int len)
+{
+ unsigned long offset = (unsigned long)base & ~PAGE_MASK;
+ struct ib_device *dev = rdma->sc_cm_id->device;
+ dma_addr_t dma_addr;
- if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
- arg_ch = &rp_ary->wc_array[0].wc_target;
- return be32_to_cpu(arg_ch->rs_handle);
- }
+ dma_addr = ib_dma_map_page(dev, virt_to_page(base),
+ offset, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(dev, dma_addr))
+ return -EIO;
+ ctxt->sge[sge_no].addr = dma_addr;
+ ctxt->sge[sge_no].length = len;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
+ svc_rdma_count_mappings(rdma, ctxt);
return 0;
}
-/* Assumptions:
- * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
- */
-static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
- u32 rmr, u64 to,
- u32 xdr_off, int write_len,
- struct svc_rdma_req_map *vec)
+static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ unsigned int sge_no,
+ struct page *page,
+ unsigned int offset,
+ unsigned int len)
{
- struct ib_rdma_wr write_wr;
- struct ib_sge *sge;
- int xdr_sge_no;
- int sge_no;
- int sge_bytes;
- int sge_off;
- int bc;
- struct svc_rdma_op_ctxt *ctxt;
+ struct ib_device *dev = rdma->sc_cm_id->device;
+ dma_addr_t dma_addr;
- if (vec->count > RPCSVC_MAXPAGES) {
- pr_err("svcrdma: Too many pages (%lu)\n", vec->count);
+ dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(dev, dma_addr))
return -EIO;
- }
- dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
- "write_len=%d, vec->sge=%p, vec->count=%lu\n",
- rmr, (unsigned long long)to, xdr_off,
- write_len, vec->sge, vec->count);
+ ctxt->sge[sge_no].addr = dma_addr;
+ ctxt->sge[sge_no].length = len;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
+ svc_rdma_count_mappings(rdma, ctxt);
+ return 0;
+}
- ctxt = svc_rdma_get_context(xprt);
+/**
+ * svc_rdma_map_reply_hdr - DMA map the transport header buffer
+ * @rdma: controlling transport
+ * @ctxt: op_ctxt for the Send WR
+ * @rdma_resp: buffer containing transport header
+ * @len: length of transport header
+ *
+ * Returns:
+ * %0 if the header is DMA mapped,
+ * %-EIO if DMA mapping failed.
+ */
+int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ __be32 *rdma_resp,
+ unsigned int len)
+{
ctxt->direction = DMA_TO_DEVICE;
- sge = ctxt->sge;
-
- /* Find the SGE associated with xdr_off */
- for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
- xdr_sge_no++) {
- if (vec->sge[xdr_sge_no].iov_len > bc)
- break;
- bc -= vec->sge[xdr_sge_no].iov_len;
- }
-
- sge_off = bc;
- bc = write_len;
- sge_no = 0;
-
- /* Copy the remaining SGE */
- while (bc != 0) {
- sge_bytes = min_t(size_t,
- bc, vec->sge[xdr_sge_no].iov_len-sge_off);
- sge[sge_no].length = sge_bytes;
- sge[sge_no].addr =
- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
- sge[sge_no].addr))
- goto err;
- svc_rdma_count_mappings(xprt, ctxt);
- sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
- ctxt->count++;
- sge_off = 0;
- sge_no++;
- xdr_sge_no++;
- if (xdr_sge_no > vec->count) {
- pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no);
- goto err;
- }
- bc -= sge_bytes;
- if (sge_no == xprt->sc_max_sge)
- break;
- }
-
- /* Prepare WRITE WR */
- memset(&write_wr, 0, sizeof write_wr);
- ctxt->cqe.done = svc_rdma_wc_write;
- write_wr.wr.wr_cqe = &ctxt->cqe;
- write_wr.wr.sg_list = &sge[0];
- write_wr.wr.num_sge = sge_no;
- write_wr.wr.opcode = IB_WR_RDMA_WRITE;
- write_wr.wr.send_flags = IB_SEND_SIGNALED;
- write_wr.rkey = rmr;
- write_wr.remote_addr = to;
-
- /* Post It */
- atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr.wr))
- goto err;
- return write_len - bc;
- err:
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
- return -EIO;
+ ctxt->pages[0] = virt_to_page(rdma_resp);
+ ctxt->count = 1;
+ return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len);
}
-noinline
-static int send_write_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_write_array *wr_ary,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rqst *rqstp,
- struct svc_rdma_req_map *vec)
+/* Load the xdr_buf into the ctxt's sge array, and DMA map each
+ * element as it is added.
+ *
+ * Returns the number of sge elements loaded on success, or
+ * a negative errno on failure.
+ */
+static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt,
+ struct xdr_buf *xdr, __be32 *wr_lst)
{
- u32 xfer_len = rqstp->rq_res.page_len;
- int write_len;
- u32 xdr_off;
- int chunk_off;
- int chunk_no;
- int nchunks;
- struct rpcrdma_write_array *res_ary;
+ unsigned int len, sge_no, remaining, page_off;
+ struct page **ppages;
+ unsigned char *base;
+ u32 xdr_pad;
int ret;
- res_ary = (struct rpcrdma_write_array *)
- &rdma_resp->rm_body.rm_chunks[1];
-
- /* Write chunks start at the pagelist */
- nchunks = be32_to_cpu(wr_ary->wc_nchunks);
- for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
- xfer_len && chunk_no < nchunks;
- chunk_no++) {
- struct rpcrdma_segment *arg_ch;
- u64 rs_offset;
-
- arg_ch = &wr_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
-
- /* Prepare the response chunk given the length actually
- * written */
- xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
- arg_ch->rs_handle,
- arg_ch->rs_offset,
- write_len);
- chunk_off = 0;
- while (write_len) {
- ret = send_write(xprt, rqstp,
- be32_to_cpu(arg_ch->rs_handle),
- rs_offset + chunk_off,
- xdr_off,
- write_len,
- vec);
- if (ret <= 0)
- goto out_err;
- chunk_off += ret;
- xdr_off += ret;
- xfer_len -= ret;
- write_len -= ret;
+ sge_no = 1;
+
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++,
+ xdr->head[0].iov_base,
+ xdr->head[0].iov_len);
+ if (ret < 0)
+ return ret;
+
+ /* If a Write chunk is present, the xdr_buf's page list
+ * is not included inline. However the Upper Layer may
+ * have added XDR padding in the tail buffer, and that
+ * should not be included inline.
+ */
+ if (wr_lst) {
+ base = xdr->tail[0].iov_base;
+ len = xdr->tail[0].iov_len;
+ xdr_pad = xdr_padsize(xdr->page_len);
+
+ if (len && xdr_pad) {
+ base += xdr_pad;
+ len -= xdr_pad;
}
+
+ goto tail;
+ }
+
+ ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+ page_off = xdr->page_base & ~PAGE_MASK;
+ remaining = xdr->page_len;
+ while (remaining) {
+ len = min_t(u32, PAGE_SIZE - page_off, remaining);
+
+ ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++,
+ *ppages++, page_off, len);
+ if (ret < 0)
+ return ret;
+
+ remaining -= len;
+ page_off = 0;
}
- /* Update the req with the number of chunks actually used */
- svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
- return rqstp->rq_res.page_len;
+ base = xdr->tail[0].iov_base;
+ len = xdr->tail[0].iov_len;
+tail:
+ if (len) {
+ ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len);
+ if (ret < 0)
+ return ret;
+ }
-out_err:
- pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret);
- return -EIO;
+ return sge_no - 1;
}
-noinline
-static int send_reply_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_write_array *rp_ary,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rqst *rqstp,
- struct svc_rdma_req_map *vec)
+/* The svc_rqst and all resources it owns are released as soon as
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
+ * so they are released by the Send completion handler.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *ctxt)
{
- u32 xfer_len = rqstp->rq_res.len;
- int write_len;
- u32 xdr_off;
- int chunk_no;
- int chunk_off;
- int nchunks;
- struct rpcrdma_segment *ch;
- struct rpcrdma_write_array *res_ary;
- int ret;
+ int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
- /* XXX: need to fix when reply lists occur with read-list and or
- * write-list */
- res_ary = (struct rpcrdma_write_array *)
- &rdma_resp->rm_body.rm_chunks[2];
-
- /* xdr offset starts at RPC message */
- nchunks = be32_to_cpu(rp_ary->wc_nchunks);
- for (xdr_off = 0, chunk_no = 0;
- xfer_len && chunk_no < nchunks;
- chunk_no++) {
- u64 rs_offset;
- ch = &rp_ary->wc_array[chunk_no].wc_target;
- write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
-
- /* Prepare the reply chunk given the length actually
- * written */
- xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
- svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
- ch->rs_handle, ch->rs_offset,
- write_len);
- chunk_off = 0;
- while (write_len) {
- ret = send_write(xprt, rqstp,
- be32_to_cpu(ch->rs_handle),
- rs_offset + chunk_off,
- xdr_off,
- write_len,
- vec);
- if (ret <= 0)
- goto out_err;
- chunk_off += ret;
- xdr_off += ret;
- xfer_len -= ret;
- write_len -= ret;
- }
+ ctxt->count += pages;
+ for (i = 0; i < pages; i++) {
+ ctxt->pages[i + 1] = rqstp->rq_respages[i];
+ rqstp->rq_respages[i] = NULL;
}
- /* Update the req with the number of chunks actually used */
- svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+}
- return rqstp->rq_res.len;
+/**
+ * svc_rdma_post_send_wr - Set up and post one Send Work Request
+ * @rdma: controlling transport
+ * @ctxt: op_ctxt for transmitting the Send WR
+ * @num_sge: number of SGEs to send
+ * @inv_rkey: R_key argument to Send With Invalidate, or zero
+ *
+ * Returns:
+ * %0 if the Send* was posted successfully,
+ * %-ENOTCONN if the connection was lost or dropped,
+ * %-EINVAL if there was a problem with the Send we built,
+ * %-ENOMEM if ib_post_send failed.
+ */
+int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
+ struct svc_rdma_op_ctxt *ctxt, int num_sge,
+ u32 inv_rkey)
+{
+ struct ib_send_wr *send_wr = &ctxt->send_wr;
-out_err:
- pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret);
- return -EIO;
+ dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge);
+
+ send_wr->next = NULL;
+ ctxt->cqe.done = svc_rdma_wc_send;
+ send_wr->wr_cqe = &ctxt->cqe;
+ send_wr->sg_list = ctxt->sge;
+ send_wr->num_sge = num_sge;
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ if (inv_rkey) {
+ send_wr->opcode = IB_WR_SEND_WITH_INV;
+ send_wr->ex.invalidate_rkey = inv_rkey;
+ } else {
+ send_wr->opcode = IB_WR_SEND;
+ }
+
+ return svc_rdma_send(rdma, send_wr);
}
-/* This function prepares the portion of the RPCRDMA message to be
- * sent in the RDMA_SEND. This function is called after data sent via
- * RDMA has already been transmitted. There are three cases:
- * - The RPCRDMA header, RPC header, and payload are all sent in a
- * single RDMA_SEND. This is the "inline" case.
- * - The RPCRDMA header and some portion of the RPC header and data
- * are sent via this RDMA_SEND and another portion of the data is
- * sent via RDMA.
- * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
- * header and data are all transmitted via RDMA.
- * In all three cases, this function prepares the RPCRDMA header in
- * sge[0], the 'type' parameter indicates the type to place in the
- * RPCRDMA header, and the 'byte_count' field indicates how much of
- * the XDR to include in this RDMA_SEND. NB: The offset of the payload
- * to send is zero in the XDR.
+/* Prepare the portion of the RPC Reply that will be transmitted
+ * via RDMA Send. The RPC-over-RDMA transport header is prepared
+ * in sge[0], and the RPC xdr_buf is prepared in following sges.
+ *
+ * Depending on whether a Write list or Reply chunk is present,
+ * the server may send all, a portion of, or none of the xdr_buf.
+ * In the latter case, only the transport header (sge[0]) is
+ * transmitted.
+ *
+ * RDMA Send is the last step of transmitting an RPC reply. Pages
+ * involved in the earlier RDMA Writes are here transferred out
+ * of the rqstp and into the ctxt's page array. These pages are
+ * DMA unmapped by each Write completion, but the subsequent Send
+ * completion finally releases these pages.
+ *
+ * Assumptions:
+ * - The Reply's transport header will never be larger than a page.
*/
-static int send_reply(struct svcxprt_rdma *rdma,
- struct svc_rqst *rqstp,
- struct page *page,
- struct rpcrdma_msg *rdma_resp,
- struct svc_rdma_req_map *vec,
- int byte_count,
- u32 inv_rkey)
+static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
+ __be32 *rdma_argp, __be32 *rdma_resp,
+ struct svc_rqst *rqstp,
+ __be32 *wr_lst, __be32 *rp_ch)
{
struct svc_rdma_op_ctxt *ctxt;
- struct ib_send_wr send_wr;
- u32 xdr_off;
- int sge_no;
- int sge_bytes;
- int page_no;
- int pages;
- int ret = -EIO;
-
- /* Prepare the context */
+ u32 inv_rkey;
+ int ret;
+
+ dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n",
+ (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"),
+ rqstp->rq_res.head[0].iov_len,
+ rqstp->rq_res.page_len,
+ rqstp->rq_res.tail[0].iov_len);
+
ctxt = svc_rdma_get_context(rdma);
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->pages[0] = page;
- ctxt->count = 1;
- /* Prepare the SGE for the RPCRDMA Header */
- ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length =
- svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
- ctxt->sge[0].addr =
- ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
- ctxt->sge[0].length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp,
+ svc_rdma_reply_hdr_len(rdma_resp));
+ if (ret < 0)
goto err;
- svc_rdma_count_mappings(rdma, ctxt);
-
- ctxt->direction = DMA_TO_DEVICE;
- /* Map the payload indicated by 'byte_count' */
- xdr_off = 0;
- for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
- sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
- byte_count -= sge_bytes;
- ctxt->sge[sge_no].addr =
- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
- sge_bytes, DMA_TO_DEVICE);
- xdr_off += sge_bytes;
- if (ib_dma_mapping_error(rdma->sc_cm_id->device,
- ctxt->sge[sge_no].addr))
+ if (!rp_ch) {
+ ret = svc_rdma_map_reply_msg(rdma, ctxt,
+ &rqstp->rq_res, wr_lst);
+ if (ret < 0)
goto err;
- svc_rdma_count_mappings(rdma, ctxt);
- ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[sge_no].length = sge_bytes;
}
- if (byte_count != 0) {
- pr_err("svcrdma: Could not map %d bytes\n", byte_count);
+
+ svc_rdma_save_io_pages(rqstp, ctxt);
+
+ inv_rkey = 0;
+ if (rdma->sc_snd_w_inv)
+ inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch);
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey);
+ if (ret)
goto err;
- }
- /* Save all respages in the ctxt and remove them from the
- * respages array. They are our pages until the I/O
- * completes.
+ return 0;
+
+err:
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ return ret;
+}
+
+/* Given the client-provided Write and Reply chunks, the server was not
+ * able to form a complete reply. Return an RDMA_ERROR message so the
+ * client can retire this RPC transaction. As above, the Send completion
+ * routine releases payload pages that were part of a previous RDMA Write.
+ *
+ * Remote Invalidation is skipped for simplicity.
+ */
+static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+ __be32 *rdma_resp, struct svc_rqst *rqstp)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+ __be32 *p;
+ int ret;
+
+ ctxt = svc_rdma_get_context(rdma);
+
+ /* Replace the original transport header with an
+ * RDMA_ERROR response. XID etc are preserved.
*/
- pages = rqstp->rq_next_page - rqstp->rq_respages;
- for (page_no = 0; page_no < pages; page_no++) {
- ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
- ctxt->count++;
- rqstp->rq_respages[page_no] = NULL;
- }
- rqstp->rq_next_page = rqstp->rq_respages + 1;
+ p = rdma_resp + 3;
+ *p++ = rdma_error;
+ *p = err_chunk;
- if (sge_no > rdma->sc_max_sge) {
- pr_err("svcrdma: Too many sges (%d)\n", sge_no);
+ ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20);
+ if (ret < 0)
goto err;
- }
- memset(&send_wr, 0, sizeof send_wr);
- ctxt->cqe.done = svc_rdma_wc_send;
- send_wr.wr_cqe = &ctxt->cqe;
- send_wr.sg_list = ctxt->sge;
- send_wr.num_sge = sge_no;
- if (inv_rkey) {
- send_wr.opcode = IB_WR_SEND_WITH_INV;
- send_wr.ex.invalidate_rkey = inv_rkey;
- } else
- send_wr.opcode = IB_WR_SEND;
- send_wr.send_flags = IB_SEND_SIGNALED;
- ret = svc_rdma_send(rdma, &send_wr);
+ svc_rdma_save_io_pages(rqstp, ctxt);
+
+ ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0);
if (ret)
goto err;
return 0;
- err:
+err:
+ pr_err("svcrdma: failed to post Send WR (%d)\n", ret);
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
return ret;
@@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
{
}
+/**
+ * svc_rdma_sendto - Transmit an RPC reply
+ * @rqstp: processed RPC request, reply XDR already in ::rq_res
+ *
+ * Any resources still associated with @rqstp are released upon return.
+ * If no reply message was possible, the connection is closed.
+ *
+ * Returns:
+ * %0 if an RPC reply has been successfully posted,
+ * %-ENOMEM if a resource shortage occurred (connection is lost),
+ * %-ENOTCONN if posting failed (connection is lost).
+ */
int svc_rdma_sendto(struct svc_rqst *rqstp)
{
struct svc_xprt *xprt = rqstp->rq_xprt;
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
- struct rpcrdma_msg *rdma_argp;
- struct rpcrdma_msg *rdma_resp;
- struct rpcrdma_write_array *wr_ary, *rp_ary;
- int ret;
- int inline_bytes;
+ __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
+ struct xdr_buf *xdr = &rqstp->rq_res;
struct page *res_page;
- struct svc_rdma_req_map *vec;
- u32 inv_rkey;
- __be32 *p;
-
- dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+ int ret;
- /* Get the RDMA request header. The receive logic always
- * places this at the start of page 0.
+ /* Find the call's chunk lists to decide how to send the reply.
+ * Receive places the Call's xprt header at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
- svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary);
-
- inv_rkey = 0;
- if (rdma->sc_snd_w_inv)
- inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
+ svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
- /* Build an req vec for the XDR */
- vec = svc_rdma_get_req_map(rdma);
- ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
- if (ret)
- goto err0;
- inline_bytes = rqstp->rq_res.len;
+ dprintk("svcrdma: preparing response for XID 0x%08x\n",
+ be32_to_cpup(rdma_argp));
/* Create the RDMA response header. xprt->xpt_mutex,
* acquired in svc_send(), serializes RPC replies. The
@@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
goto err0;
rdma_resp = page_address(res_page);
- p = &rdma_resp->rm_xid;
- *p++ = rdma_argp->rm_xid;
- *p++ = rdma_argp->rm_vers;
+ p = rdma_resp;
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+ *p++ = rp_ch ? rdma_nomsg : rdma_msg;
/* Start with empty chunks */
*p++ = xdr_zero;
*p++ = xdr_zero;
*p = xdr_zero;
- /* Send any write-chunk data and build resp write-list */
- if (wr_ary) {
- ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec);
+ if (wr_lst) {
+ /* XXX: Presume the client sent only one Write chunk */
+ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
if (ret < 0)
- goto err1;
- inline_bytes -= ret + xdr_padsize(ret);
+ goto err2;
+ svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
}
-
- /* Send any reply-list data and update resp reply-list */
- if (rp_ary) {
- ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec);
+ if (rp_ch) {
+ ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
if (ret < 0)
- goto err1;
- inline_bytes -= ret;
+ goto err2;
+ svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
}
- /* Post a fresh Receive buffer _before_ sending the reply */
ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
if (ret)
goto err1;
-
- ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
- inline_bytes, inv_rkey);
+ ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp,
+ wr_lst, rp_ch);
if (ret < 0)
goto err0;
+ return 0;
- svc_rdma_put_req_map(rdma, vec);
- dprintk("svcrdma: send_reply returns %d\n", ret);
- return ret;
+ err2:
+ if (ret != -E2BIG)
+ goto err1;
+
+ ret = svc_rdma_post_recv(rdma, GFP_KERNEL);
+ if (ret)
+ goto err1;
+ ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp);
+ if (ret < 0)
+ goto err0;
+ return 0;
err1:
put_page(res_page);
err0:
- svc_rdma_put_req_map(rdma, vec);
pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
ret);
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
return -ENOTCONN;
}
-
-void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
- int status)
-{
- struct ib_send_wr err_wr;
- struct page *p;
- struct svc_rdma_op_ctxt *ctxt;
- enum rpcrdma_errcode err;
- __be32 *va;
- int length;
- int ret;
-
- ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
- if (ret)
- return;
-
- p = alloc_page(GFP_KERNEL);
- if (!p)
- return;
- va = page_address(p);
-
- /* XDR encode an error reply */
- err = ERR_CHUNK;
- if (status == -EPROTONOSUPPORT)
- err = ERR_VERS;
- length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
-
- ctxt = svc_rdma_get_context(xprt);
- ctxt->direction = DMA_TO_DEVICE;
- ctxt->count = 1;
- ctxt->pages[0] = p;
-
- /* Prepare SGE for local address */
- ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = length;
- ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
- p, 0, length, DMA_TO_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
- dprintk("svcrdma: Error mapping buffer for protocol error\n");
- svc_rdma_put_context(ctxt, 1);
- return;
- }
- svc_rdma_count_mappings(xprt, ctxt);
-
- /* Prepare SEND WR */
- memset(&err_wr, 0, sizeof(err_wr));
- ctxt->cqe.done = svc_rdma_wc_send;
- err_wr.wr_cqe = &ctxt->cqe;
- err_wr.sg_list = ctxt->sge;
- err_wr.num_sge = 1;
- err_wr.opcode = IB_WR_SEND;
- err_wr.send_flags = IB_SEND_SIGNALED;
-
- /* Post It */
- ret = svc_rdma_send(xprt, &err_wr);
- if (ret) {
- dprintk("svcrdma: Error %d posting send for protocol error\n",
- ret);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 1);
- }
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fc8f14c7bfec..a9d9cb1ba4c6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
}
}
-static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
-{
- struct svc_rdma_req_map *map;
-
- map = kmalloc(sizeof(*map), flags);
- if (map)
- INIT_LIST_HEAD(&map->free);
- return map;
-}
-
-static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
-{
- unsigned int i;
-
- /* One for each receive buffer on this connection. */
- i = xprt->sc_max_requests;
-
- while (i--) {
- struct svc_rdma_req_map *map;
-
- map = alloc_req_map(GFP_KERNEL);
- if (!map) {
- dprintk("svcrdma: No memory for request map\n");
- return false;
- }
- list_add(&map->free, &xprt->sc_maps);
- }
- return true;
-}
-
-struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
-{
- struct svc_rdma_req_map *map = NULL;
-
- spin_lock(&xprt->sc_map_lock);
- if (list_empty(&xprt->sc_maps))
- goto out_empty;
-
- map = list_first_entry(&xprt->sc_maps,
- struct svc_rdma_req_map, free);
- list_del_init(&map->free);
- spin_unlock(&xprt->sc_map_lock);
-
-out:
- map->count = 0;
- return map;
-
-out_empty:
- spin_unlock(&xprt->sc_map_lock);
-
- /* Pre-allocation amount was incorrect */
- map = alloc_req_map(GFP_NOIO);
- if (map)
- goto out;
-
- WARN_ONCE(1, "svcrdma: empty request map list?\n");
- return NULL;
-}
-
-void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
- struct svc_rdma_req_map *map)
-{
- spin_lock(&xprt->sc_map_lock);
- list_add(&map->free, &xprt->sc_maps);
- spin_unlock(&xprt->sc_map_lock);
-}
-
-static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
-{
- while (!list_empty(&xprt->sc_maps)) {
- struct svc_rdma_req_map *map;
-
- map = list_first_entry(&xprt->sc_maps,
- struct svc_rdma_req_map, free);
- list_del(&map->free);
- kfree(map);
- }
-}
-
/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
@@ -474,24 +395,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
}
/**
- * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
- * @cq: completion queue
- * @wc: completed WR
- *
- */
-void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct svc_rdma_op_ctxt *ctxt;
-
- svc_rdma_send_wc_common_put(cq, wc, "write");
-
- ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 0);
-}
-
-/**
* svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
* @cq: completion queue
* @wc: completed WR
@@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
- INIT_LIST_HEAD(&cma_xprt->sc_maps);
+ INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
spin_lock_init(&cma_xprt->sc_ctxt_lock);
- spin_lock_init(&cma_xprt->sc_map_lock);
+ spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
/*
* Note that this implies that the underlying transport support
@@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt, newxprt->sc_cm_id);
dev = newxprt->sc_cm_id->device;
+ newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
@@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
- newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+ newxprt->sc_sq_depth = newxprt->sc_rq_depth;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
- if (!svc_rdma_prealloc_maps(newxprt))
- goto errout;
/*
* Limit ORD based on client limit, local device limit, and
@@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
+ qp_attr.port_num = newxprt->sc_cm_id->port_num;
+ qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
@@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
}
rdma_dealloc_frmr_q(rdma);
+ svc_rdma_destroy_rw_ctxts(rdma);
svc_rdma_destroy_ctxts(rdma);
- svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c717f5410776..62ecbccd9748 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -66,8 +66,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
- int xprt_rdma_pad_optimize = 0;
+unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+int xprt_rdma_pad_optimize;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -396,7 +396,7 @@ xprt_setup_rdma(struct xprt_create *args)
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap);
if (rc)
goto out1;
@@ -457,19 +457,33 @@ out1:
return ERR_PTR(rc);
}
-/*
- * Close a connection, during shutdown or timeout/reconnect
+/**
+ * xprt_rdma_close - Close down RDMA connection
+ * @xprt: generic transport to be closed
+ *
+ * Called during transport shutdown reconnect, or device
+ * removal. Caller holds the transport's write lock.
*/
static void
xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+ dprintk("RPC: %s: closing xprt %p\n", __func__, xprt);
- dprintk("RPC: %s: closing\n", __func__);
- if (r_xprt->rx_ep.rep_connected > 0)
+ if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
+ xprt_clear_connected(xprt);
+ rpcrdma_ia_remove(ia);
+ return;
+ }
+ if (ep->rep_connected == -ENODEV)
+ return;
+ if (ep->rep_connected > 0)
xprt->reestablish_timeout = 0;
xprt_disconnect_done(xprt);
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ep_disconnect(ep, ia);
}
static void
@@ -484,6 +498,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
dprintk("RPC: %s: %u\n", __func__, port);
}
+/**
+ * xprt_rdma_timer - invoked when an RPC times out
+ * @xprt: controlling RPC transport
+ * @task: RPC task that timed out
+ *
+ * Invoked when the transport is still connected, but an RPC
+ * retransmit timeout occurs.
+ *
+ * Since RDMA connections don't have a keep-alive, forcibly
+ * disconnect and retry to connect. This drives full
+ * detection of the network path, and retransmissions of
+ * all pending RPCs.
+ */
+static void
+xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+ dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt);
+
+ xprt_force_disconnect(xprt);
+}
+
static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
@@ -659,6 +694,8 @@ xprt_rdma_free(struct rpc_task *task)
* xprt_rdma_send_request - marshal and send an RPC request
* @task: RPC task with an RPC message in rq_snd_buf
*
+ * Caller holds the transport's write lock.
+ *
* Return values:
* 0: The request has been sent
* ENOTCONN: Caller needs to invoke connect logic then call again
@@ -685,6 +722,9 @@ xprt_rdma_send_request(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
+ if (!xprt_connected(xprt))
+ goto drop_connection;
+
/* On retransmit, remove any previously registered chunks */
if (unlikely(!list_empty(&req->rl_registered)))
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
@@ -776,6 +816,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.alloc_slot = xprt_alloc_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
.set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */
+ .timer = xprt_rdma_timer,
.rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
.set_port = xprt_rdma_set_port,
.connect = xprt_rdma_connect,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3b332b395045..3dbce9ac4327 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -53,7 +53,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
-#include <linux/module.h> /* try_module_get()/module_put() */
+
#include <rdma/ib_cm.h>
#include "xprt_rdma.h"
@@ -69,8 +69,11 @@
/*
* internal functions
*/
+static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf);
+static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
-static struct workqueue_struct *rpcrdma_receive_wq;
+static struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
int
rpcrdma_alloc_wq(void)
@@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
- ib_dma_sync_single_for_cpu(rep->rr_device,
+ ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
@@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
__func__, ep);
complete(&ia->ri_done);
break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ pr_info("rpcrdma: removing device for %pIS:%u\n",
+ sap, rpc_get_port(sap));
+#endif
+ set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
+ ep->rep_connected = -ENODEV;
+ xprt_force_disconnect(&xprt->rx_xprt);
+ wait_for_completion(&ia->ri_remove_done);
+
+ ia->ri_id = NULL;
+ ia->ri_pd = NULL;
+ ia->ri_device = NULL;
+ /* Return 1 to ensure the core destroys the id. */
+ return 1;
case RDMA_CM_EVENT_ESTABLISHED:
connstate = 1;
ib_query_qp(ia->ri_id->qp, attr,
@@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
connstate = -ECONNABORTED;
- goto connected;
- case RDMA_CM_EVENT_DEVICE_REMOVAL:
- connstate = -ENODEV;
connected:
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
@@ -329,14 +344,6 @@ connected:
return 0;
}
-static void rpcrdma_destroy_id(struct rdma_cm_id *id)
-{
- if (id) {
- module_put(id->device->owner);
- rdma_destroy_id(id);
- }
-}
-
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
int rc;
init_completion(&ia->ri_done);
+ init_completion(&ia->ri_remove_done);
id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
IB_QPT_RC);
@@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
goto out;
}
- /* FIXME:
- * Until xprtrdma supports DEVICE_REMOVAL, the provider must
- * be pinned while there are active NFS/RDMA mounts to prevent
- * hangs and crashes at umount time.
- */
- if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
- dprintk("RPC: %s: Failed to get device module\n",
- __func__);
- ia->ri_async_rc = -ENODEV;
- }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
if (rc < 0) {
dprintk("RPC: %s: wait() exited: %i\n",
__func__, rc);
- goto put;
+ goto out;
}
rc = ia->ri_async_rc;
if (rc)
- goto put;
+ goto out;
return id;
-put:
- module_put(id->device->owner);
+
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -413,13 +410,16 @@ out:
* Exported functions.
*/
-/*
- * Open and initialize an Interface Adapter.
- * o initializes fields of struct rpcrdma_ia, including
- * interface and provider attributes and protection zone.
+/**
+ * rpcrdma_ia_open - Open and initialize an Interface Adapter.
+ * @xprt: controlling transport
+ * @addr: IP address of remote peer
+ *
+ * Returns 0 on success, negative errno if an appropriate
+ * Interface Adapter could not be found and opened.
*/
int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
int rc;
@@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
rc = PTR_ERR(ia->ri_id);
- goto out1;
+ goto out_err;
}
ia->ri_device = ia->ri_id->device;
@@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out2;
+ goto out_err;
}
- switch (memreg) {
+ switch (xprt_rdma_memreg_strategy) {
case RPCRDMA_FRMR:
if (frwr_is_supported(ia)) {
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
@@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
/*FALLTHROUGH*/
default:
- pr_err("rpcrdma: Unsupported memory registration mode: %d\n",
- memreg);
+ pr_err("rpcrdma: Device %s does not support memreg mode %d\n",
+ ia->ri_device->name, xprt_rdma_memreg_strategy);
rc = -EINVAL;
- goto out3;
+ goto out_err;
}
return 0;
-out3:
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-out2:
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
-out1:
+out_err:
+ rpcrdma_ia_close(ia);
return rc;
}
-/*
- * Clean up/close an IA.
- * o if event handles and PD have been initialized, free them.
- * o close the IA
+/**
+ * rpcrdma_ia_remove - Handle device driver unload
+ * @ia: interface adapter being removed
+ *
+ * Divest transport H/W resources associated with this adapter,
+ * but allow it to be restored later.
+ */
+void
+rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+{
+ struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+ rx_ia);
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req;
+ struct rpcrdma_rep *rep;
+
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
+
+ /* This is similar to rpcrdma_ep_destroy, but:
+ * - Don't cancel the connect worker.
+ * - Don't call rpcrdma_ep_disconnect, which waits
+ * for another conn upcall, which will deadlock.
+ * - rdma_disconnect is unneeded, the underlying
+ * connection is already gone.
+ */
+ if (ia->ri_id->qp) {
+ ib_drain_qp(ia->ri_id->qp);
+ rdma_destroy_qp(ia->ri_id);
+ ia->ri_id->qp = NULL;
+ }
+ ib_free_cq(ep->rep_attr.recv_cq);
+ ib_free_cq(ep->rep_attr.send_cq);
+
+ /* The ULP is responsible for ensuring all DMA
+ * mappings and MRs are gone.
+ */
+ list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list)
+ rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf);
+ list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+ rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_sendbuf);
+ rpcrdma_dma_unmap_regbuf(req->rl_recvbuf);
+ }
+ rpcrdma_destroy_mrs(buf);
+
+ /* Allow waiters to continue */
+ complete(&ia->ri_remove_done);
+}
+
+/**
+ * rpcrdma_ia_close - Clean up/close an IA.
+ * @ia: interface adapter to close
+ *
*/
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
@@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rpcrdma_destroy_id(ia->ri_id);
- ia->ri_id = NULL;
+ rdma_destroy_id(ia->ri_id);
}
+ ia->ri_id = NULL;
+ ia->ri_device = NULL;
/* If the pd is still busy, xprtrdma missed freeing a resource */
if (ia->ri_pd && !IS_ERR(ia->ri_pd))
ib_dealloc_pd(ia->ri_pd);
+ ia->ri_pd = NULL;
}
/*
@@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ib_free_cq(ep->rep_attr.send_cq);
}
+/* Re-establish a connection after a device removal event.
+ * Unlike a normal reconnection, a fresh PD and a new set
+ * of MRs and buffers is needed.
+ */
+static int
+rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ int rc, err;
+
+ pr_info("%s: r_xprt = %p\n", __func__, r_xprt);
+
+ rc = -EHOSTUNREACH;
+ if (rpcrdma_ia_open(r_xprt, sap))
+ goto out1;
+
+ rc = -ENOMEM;
+ err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data);
+ if (err) {
+ pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
+ goto out2;
+ }
+
+ rc = -ENETUNREACH;
+ err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
+ goto out3;
+ }
+
+ rpcrdma_create_mrs(r_xprt);
+ return 0;
+
+out3:
+ rpcrdma_ep_destroy(ep, ia);
+out2:
+ rpcrdma_ia_close(ia);
+out1:
+ return rc;
+}
+
+static int
+rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep,
+ struct rpcrdma_ia *ia)
+{
+ struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ struct rdma_cm_id *id, *old;
+ int err, rc;
+
+ dprintk("RPC: %s: reconnecting...\n", __func__);
+
+ rpcrdma_ep_disconnect(ep, ia);
+
+ rc = -EHOSTUNREACH;
+ id = rpcrdma_create_id(r_xprt, ia, sap);
+ if (IS_ERR(id))
+ goto out;
+
+ /* As long as the new ID points to the same device as the
+ * old ID, we can reuse the transport's existing PD and all
+ * previously allocated MRs. Also, the same device means
+ * the transport's previous DMA mappings are still valid.
+ *
+ * This is a sanity check only. There should be no way these
+ * point to two different devices here.
+ */
+ old = id;
+ rc = -ENETUNREACH;
+ if (ia->ri_device != id->device) {
+ pr_err("rpcrdma: can't reconnect on different device!\n");
+ goto out_destroy;
+ }
+
+ err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
+ if (err) {
+ dprintk("RPC: %s: rdma_create_qp returned %d\n",
+ __func__, err);
+ goto out_destroy;
+ }
+
+ /* Atomically replace the transport's ID and QP. */
+ rc = 0;
+ old = ia->ri_id;
+ ia->ri_id = id;
+ rdma_destroy_qp(old);
+
+out_destroy:
+ rdma_destroy_id(old);
+out:
+ return rc;
+}
+
/*
* Connect unconnected endpoint.
*/
@@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
- struct rdma_cm_id *id, *old;
- struct sockaddr *sap;
unsigned int extras;
- int rc = 0;
+ int rc;
- if (ep->rep_connected != 0) {
retry:
- dprintk("RPC: %s: reconnecting...\n", __func__);
-
- rpcrdma_ep_disconnect(ep, ia);
-
- sap = (struct sockaddr *)&r_xprt->rx_data.addr;
- id = rpcrdma_create_id(r_xprt, ia, sap);
- if (IS_ERR(id)) {
- rc = -EHOSTUNREACH;
- goto out;
- }
- /* TEMP TEMP TEMP - fail if new device:
- * Deregister/remarshal *all* requests!
- * Close and recreate adapter, pd, etc!
- * Re-determine all attributes still sane!
- * More stuff I haven't thought of!
- * Rrrgh!
- */
- if (ia->ri_device != id->device) {
- printk("RPC: %s: can't reconnect on "
- "different device!\n", __func__);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
- /* END TEMP */
- rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
- if (rc) {
- dprintk("RPC: %s: rdma_create_qp failed %i\n",
- __func__, rc);
- rpcrdma_destroy_id(id);
- rc = -ENETUNREACH;
- goto out;
- }
-
- old = ia->ri_id;
- ia->ri_id = id;
-
- rdma_destroy_qp(old);
- rpcrdma_destroy_id(old);
- } else {
+ switch (ep->rep_connected) {
+ case 0:
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- /* do not update ep->rep_connected */
- return -ENETUNREACH;
+ rc = -ENETUNREACH;
+ goto out_noupdate;
}
+ break;
+ case -ENODEV:
+ rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia);
+ if (rc)
+ goto out_noupdate;
+ break;
+ default:
+ rc = rpcrdma_ep_reconnect(r_xprt, ep, ia);
+ if (rc)
+ goto out;
}
ep->rep_connected = 0;
@@ -736,6 +845,8 @@ retry:
out:
if (rc)
ep->rep_connected = rc;
+
+out_noupdate:
return rc;
}
@@ -878,7 +989,6 @@ struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_rep *rep;
int rc;
@@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
- rep->rr_device = ia->ri_device;
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
@@ -1037,6 +1146,7 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
cancel_delayed_work_sync(&buf->rb_recovery_worker);
+ cancel_delayed_work_sync(&buf->rb_refresh_worker);
while (!list_empty(&buf->rb_recv_bufs)) {
struct rpcrdma_rep *rep;
@@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
out_nomws:
dprintk("RPC: %s: no MWs available\n", __func__);
- schedule_delayed_work(&buf->rb_refresh_worker, 0);
+ if (r_xprt->rx_ep.rep_connected != -ENODEV)
+ schedule_delayed_work(&buf->rb_refresh_worker, 0);
/* Allow the reply handler and refresh worker to run */
cond_resched();
@@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
bool
__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
+ struct ib_device *device = ia->ri_device;
+
if (rb->rg_direction == DMA_NONE)
return false;
- rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+ rb->rg_iov.addr = ib_dma_map_single(device,
(void *)rb->rg_base,
rdmab_length(rb),
rb->rg_direction);
- if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+ if (ib_dma_mapping_error(device, rdmab_addr(rb)))
return false;
- rb->rg_device = ia->ri_device;
+ rb->rg_device = device;
rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
return true;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 171a35116de9..1d66acf1a723 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -69,6 +69,7 @@ struct rpcrdma_ia {
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct completion ri_done;
+ struct completion ri_remove_done;
int ri_async_rc;
unsigned int ri_max_segs;
unsigned int ri_max_frmr_depth;
@@ -78,10 +79,15 @@ struct rpcrdma_ia {
bool ri_reminv_expected;
bool ri_implicit_roundup;
enum ib_mr_type ri_mrtype;
+ unsigned long ri_flags;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
+enum {
+ RPCRDMA_IAF_REMOVING = 0,
+};
+
/*
* RDMA Endpoint -- one per transport instance
*/
@@ -164,6 +170,12 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
+static inline struct ib_device *
+rdmab_device(struct rpcrdma_regbuf *rb)
+{
+ return rb->rg_device;
+}
+
#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
/* To ensure a transport can always make forward progress,
@@ -209,7 +221,6 @@ struct rpcrdma_rep {
unsigned int rr_len;
int rr_wc_flags;
u32 rr_inv_rkey;
- struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
struct list_head rr_list;
@@ -380,7 +391,6 @@ struct rpcrdma_buffer {
spinlock_t rb_mwlock; /* protect rb_mws list */
struct list_head rb_mws;
struct list_head rb_all;
- char *rb_pool;
spinlock_t rb_lock; /* protect buf lists */
int rb_send_count, rb_recv_count;
@@ -497,10 +507,16 @@ struct rpcrdma_xprt {
* Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;
+/* This setting controls the hunt for a supported memory
+ * registration strategy.
+ */
+extern unsigned int xprt_rdma_memreg_strategy;
+
/*
* Interface Adapter calls - xprtrdma/verbs.c
*/
-int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
+int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr);
+void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
void rpcrdma_ia_close(struct rpcrdma_ia *);
bool frwr_is_supported(struct rpcrdma_ia *);
bool fmr_is_supported(struct rpcrdma_ia *);
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 919981324171..9aed6fe1bf1a 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -106,7 +106,6 @@ __init int net_sysctl_init(void)
ret = register_pernet_subsys(&sysctl_pernet_ops);
if (ret)
goto out1;
- register_sysctl_root(&net_sysctl_root);
out:
return ret;
out1:
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 0d4f2f455a7c..1b92b72e812f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -362,25 +362,25 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout)
return 0;
}
-#define tipc_wait_for_cond(sock_, timeout_, condition_) \
-({ \
- int rc_ = 0; \
- int done_ = 0; \
- \
- while (!(condition_) && !done_) { \
- struct sock *sk_ = sock->sk; \
- DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
- \
- rc_ = tipc_sk_sock_err(sock_, timeout_); \
- if (rc_) \
- break; \
- prepare_to_wait(sk_sleep(sk_), &wait_, \
- TASK_INTERRUPTIBLE); \
- done_ = sk_wait_event(sk_, timeout_, \
- (condition_), &wait_); \
- remove_wait_queue(sk_sleep(sk_), &wait_); \
- } \
- rc_; \
+#define tipc_wait_for_cond(sock_, timeo_, condition_) \
+({ \
+ struct sock *sk_; \
+ int rc_; \
+ \
+ while ((rc_ = !(condition_))) { \
+ DEFINE_WAIT_FUNC(wait_, woken_wake_function); \
+ sk_ = (sock_)->sk; \
+ rc_ = tipc_sk_sock_err((sock_), timeo_); \
+ if (rc_) \
+ break; \
+ prepare_to_wait(sk_sleep(sk_), &wait_, TASK_INTERRUPTIBLE); \
+ release_sock(sk_); \
+ *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \
+ sched_annotate_sleep(); \
+ lock_sock(sk_); \
+ remove_wait_queue(sk_sleep(sk_), &wait_); \
+ } \
+ rc_; \
})
/**
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 6f7f6757ceef..dfc8c51e4d74 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1540,8 +1540,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
long timeout;
int err;
struct vsock_transport_send_notify_data send_data;
-
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
sk = sock->sk;
vsk = vsock_sk(sk);
@@ -1584,11 +1583,10 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
if (err < 0)
goto out;
-
while (total_written < len) {
ssize_t written;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ add_wait_queue(sk_sleep(sk), &wait);
while (vsock_stream_has_space(vsk) == 0 &&
sk->sk_err == 0 &&
!(sk->sk_shutdown & SEND_SHUTDOWN) &&
@@ -1597,33 +1595,30 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
/* Don't wait for non-blocking sockets. */
if (timeout == 0) {
err = -EAGAIN;
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
}
err = transport->notify_send_pre_block(vsk, &send_data);
if (err < 0) {
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
}
release_sock(sk);
- timeout = schedule_timeout(timeout);
+ timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
lock_sock(sk);
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
} else if (timeout == 0) {
err = -EAGAIN;
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
goto out_err;
}
-
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
}
- finish_wait(sk_sleep(sk), &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
/* These checks occur both as part of and after the loop
* conditional since we need to check before and after
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 9dffe0282ad4..403d86e80162 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -576,9 +576,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->vdev = vdev;
- ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX,
- vsock->vqs, callbacks, names,
- NULL);
+ ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX,
+ vsock->vqs, callbacks, names,
+ NULL);
if (ret < 0)
goto out;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 8b911c29860e..5a1a98df3499 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1791,32 +1791,40 @@ void x25_kill_by_neigh(struct x25_neigh *nb)
static int __init x25_init(void)
{
- int rc = proto_register(&x25_proto, 0);
+ int rc;
- if (rc != 0)
+ rc = proto_register(&x25_proto, 0);
+ if (rc)
goto out;
rc = sock_register(&x25_family_ops);
- if (rc != 0)
+ if (rc)
goto out_proto;
dev_add_pack(&x25_packet_type);
rc = register_netdevice_notifier(&x25_dev_notifier);
- if (rc != 0)
+ if (rc)
goto out_sock;
- pr_info("Linux Version 0.2\n");
+ rc = x25_register_sysctl();
+ if (rc)
+ goto out_dev;
- x25_register_sysctl();
rc = x25_proc_init();
- if (rc != 0)
- goto out_dev;
+ if (rc)
+ goto out_sysctl;
+
+ pr_info("Linux Version 0.2\n");
+
out:
return rc;
+out_sysctl:
+ x25_unregister_sysctl();
out_dev:
unregister_netdevice_notifier(&x25_dev_notifier);
out_sock:
+ dev_remove_pack(&x25_packet_type);
sock_unregister(AF_X25);
out_proto:
proto_unregister(&x25_proto);
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index a06dfe143c67..ba078c85f0a1 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -73,9 +73,12 @@ static struct ctl_table x25_table[] = {
{ },
};
-void __init x25_register_sysctl(void)
+int __init x25_register_sysctl(void)
{
x25_table_header = register_net_sysctl(&init_net, "net/x25", x25_table);
+ if (!x25_table_header)
+ return -ENOMEM;
+ return 0;
}
void x25_unregister_sysctl(void)
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 8ec8a3fcf8d4..574e6f32f94f 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -170,7 +170,7 @@ static int xfrm_dev_feat_change(struct net_device *dev)
static int xfrm_dev_down(struct net_device *dev)
{
- if (dev->hw_features & NETIF_F_HW_ESP)
+ if (dev->features & NETIF_F_HW_ESP)
xfrm_dev_state_flush(dev_net(dev), dev, true);
xfrm_garbage_collect(dev_net(dev));
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index b00a1d5a7f52..ed4e52d95172 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1797,43 +1797,6 @@ free_dst:
goto out;
}
-#ifdef CONFIG_XFRM_SUB_POLICY
-static int xfrm_dst_alloc_copy(void **target, const void *src, int size)
-{
- if (!*target) {
- *target = kmalloc(size, GFP_ATOMIC);
- if (!*target)
- return -ENOMEM;
- }
-
- memcpy(*target, src, size);
- return 0;
-}
-#endif
-
-static int xfrm_dst_update_parent(struct dst_entry *dst,
- const struct xfrm_selector *sel)
-{
-#ifdef CONFIG_XFRM_SUB_POLICY
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- return xfrm_dst_alloc_copy((void **)&(xdst->partner),
- sel, sizeof(*sel));
-#else
- return 0;
-#endif
-}
-
-static int xfrm_dst_update_origin(struct dst_entry *dst,
- const struct flowi *fl)
-{
-#ifdef CONFIG_XFRM_SUB_POLICY
- struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
- return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl));
-#else
- return 0;
-#endif
-}
-
static int xfrm_expand_policies(const struct flowi *fl, u16 family,
struct xfrm_policy **pols,
int *num_pols, int *num_xfrms)
@@ -1905,16 +1868,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
xdst = (struct xfrm_dst *)dst;
xdst->num_xfrms = err;
- if (num_pols > 1)
- err = xfrm_dst_update_parent(dst, &pols[1]->selector);
- else
- err = xfrm_dst_update_origin(dst, fl);
- if (unlikely(err)) {
- dst_free(dst);
- XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR);
- return ERR_PTR(err);
- }
-
xdst->num_pols = num_pols;
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
xdst->policy_genid = atomic_read(&pols[0]->genid);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index fc3c5aa38754..2e291bc5f1fc 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1383,6 +1383,8 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig)
x->curlft.add_time = orig->curlft.add_time;
x->km.state = orig->km.state;
x->km.seq = orig->km.seq;
+ x->replay = orig->replay;
+ x->preplay = orig->preplay;
return x;