diff options
Diffstat (limited to 'net')
114 files changed, 2400 insertions, 1511 deletions
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 71e85643b3f9..6ad3e043c617 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -454,8 +454,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, goto error_xenbus; } priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); - if (!priv->tag) { - ret = -EINVAL; + if (IS_ERR(priv->tag)) { + ret = PTR_ERR(priv->tag); goto error_xenbus; } ret = xenbus_transaction_end(xbt, 0); @@ -525,7 +525,7 @@ static struct xenbus_driver xen_9pfs_front_driver = { .otherend_changed = xen_9pfs_front_changed, }; -int p9_trans_xen_init(void) +static int p9_trans_xen_init(void) { if (!xen_domain()) return -ENODEV; @@ -537,7 +537,7 @@ int p9_trans_xen_init(void) } module_init(p9_trans_xen_init); -void p9_trans_xen_exit(void) +static void p9_trans_xen_exit(void) { v9fs_unregister_trans(&p9_xen_trans); return xenbus_unregister_driver(&xen_9pfs_front_driver); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index c5ce7745b230..574f78824d8a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -835,6 +835,13 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EPROTONOSUPPORT; } } + + if (data[IFLA_BR_VLAN_DEFAULT_PVID]) { + __u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]); + + if (defpvid >= VLAN_VID_MASK) + return -EINVAL; + } #endif return 0; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 08341d2aa9c9..0db8102995a5 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -179,6 +179,7 @@ static void br_stp_start(struct net_bridge *br) br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ + mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c index c98b3e5c140a..60b6fe277a8b 100644 --- a/net/bridge/br_stp_timer.c +++ b/net/bridge/br_stp_timer.c @@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg) if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); - if (br->stp_enabled != BR_USER_STP) + if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index 5929309beaa1..db85230e49c3 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -68,6 +68,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par) if (e->ethproto != htons(ETH_P_ARP) || e->invflags & EBT_IPROTO) return -EINVAL; + if (ebt_invalid_target(info->target)) + return -EINVAL; + return 0; } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9ec0c9f908fa..9c6e619f452b 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name, strlcpy(name, _name, sizeof(name)); if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || - xt_data_to_user(um + entrysize, data, usersize, datasize)) + xt_data_to_user(um + entrysize, data, usersize, datasize, + XT_ALIGN(datasize))) return -EFAULT; return 0; @@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, if (match->compat_to_user(cm->data, m->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, m->data, match->usersize, msize)) + if (xt_data_to_user(cm->data, m->data, match->usersize, msize, + COMPAT_XT_ALIGN(msize))) return -EFAULT; } @@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target *t, if (target->compat_to_user(cm->data, t->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, t->data, target->usersize, tsize)) + if (xt_data_to_user(cm->data, t->data, target->usersize, tsize, + COMPAT_XT_ALIGN(tsize))) return -EFAULT; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 108533859a53..4fd02831beed 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -45,6 +45,17 @@ bool libceph_compatible(void *data) } EXPORT_SYMBOL(libceph_compatible); +static int param_get_supported_features(char *buffer, + const struct kernel_param *kp) +{ + return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT); +} +static const struct kernel_param_ops param_ops_supported_features = { + .get = param_get_supported_features, +}; +module_param_cb(supported_features, ¶m_ops_supported_features, NULL, + S_IRUGO); + /* * find filename portion of a path (/foo/bar/baz -> baz) */ @@ -187,7 +198,7 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return ptr; } - return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, flags, PAGE_KERNEL); } @@ -596,9 +607,7 @@ EXPORT_SYMBOL(ceph_client_gid); /* * create a fresh client instance */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - u64 supported_features, - u64 required_features) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; @@ -615,14 +624,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, init_waitqueue_head(&client->auth_wq); client->auth_err = 0; - if (!ceph_test_opt(client, NOMSGAUTH)) - required_features |= CEPH_FEATURE_MSG_AUTH; - client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | - supported_features; - client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | - required_features; + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT; + + if (!ceph_test_opt(client, NOMSGAUTH)) + client->required_features |= CEPH_FEATURE_MSG_AUTH; /* msgr */ if (ceph_test_opt(client, MYIP)) diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index b9233b990399..08ada893f01e 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_cls_break_lock); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie) +{ + int cookie_op_buf_size; + int name_len = strlen(lock_name); + int old_cookie_len = strlen(old_cookie); + int tag_len = strlen(tag); + int new_cookie_len = strlen(new_cookie); + void *p, *end; + struct page *cookie_op_page; + int ret; + + cookie_op_buf_size = name_len + sizeof(__le32) + + old_cookie_len + sizeof(__le32) + + tag_len + sizeof(__le32) + + new_cookie_len + sizeof(__le32) + + sizeof(u8) + CEPH_ENCODING_START_BLK_LEN; + if (cookie_op_buf_size > PAGE_SIZE) + return -E2BIG; + + cookie_op_page = alloc_page(GFP_NOIO); + if (!cookie_op_page) + return -ENOMEM; + + p = page_address(cookie_op_page); + end = p + cookie_op_buf_size; + + /* encode cls_lock_set_cookie_op struct */ + ceph_start_encoding(&p, 1, 1, + cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_8(&p, type); + ceph_encode_string(&p, end, old_cookie, old_cookie_len); + ceph_encode_string(&p, end, tag, tag_len); + ceph_encode_string(&p, end, new_cookie, new_cookie_len); + + dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n", + __func__, lock_name, type, old_cookie, tag, new_cookie); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie", + CEPH_OSD_FLAG_WRITE, cookie_op_page, + cookie_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(cookie_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_set_cookie); + void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers) { int i; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c62b2b029a6e..71ba13927b3d 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p) return 0; down_read(&osdc->lock); - seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); + seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, + osdc->epoch_barrier, map->flags); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = @@ -177,9 +178,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_printf(s, "%llu\t", req->r_tid); dump_target(s, &req->r_t); - seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, - le32_to_cpu(req->r_replay_version.epoch), - le64_to_cpu(req->r_replay_version.version)); + seq_printf(s, "\t%d", req->r_attempts); for (i = 0; i < req->r_num_ops; i++) { struct ceph_osd_req_op *op = &req->r_ops[i]; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f76bb3332613..5766a6c896c4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1386,8 +1386,9 @@ static void prepare_write_keepalive(struct ceph_connection *con) dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec now = CURRENT_TIME; + struct timespec now; + ktime_get_real_ts(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); ceph_encode_timespec(&con->out_temp_keepalive2, &now); con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), @@ -3176,8 +3177,9 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { - struct timespec now = CURRENT_TIME; + struct timespec now; struct timespec ts; + ktime_get_real_ts(&now); jiffies_to_timespec(interval, &ts); ts = timespec_add(con->last_keepalive_ack, ts); return timespec_compare(&now, &ts) >= 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e15ea9e4c495..924f07c36ddb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_abort_on_full = true; req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); @@ -1005,7 +1006,7 @@ static bool osd_registered(struct ceph_osd *osd) */ static void osd_init(struct ceph_osd *osd) { - atomic_set(&osd->o_ref, 1); + refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; @@ -1050,9 +1051,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) static struct ceph_osd *get_osd(struct ceph_osd *osd) { - if (atomic_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, - atomic_read(&osd->o_ref)); + if (refcount_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, + refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); @@ -1062,9 +1063,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd) static void put_osd(struct ceph_osd *osd) { - dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), - atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { + dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), + refcount_read(&osd->o_ref) - 1); + if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); kfree(osd); } @@ -1297,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || + (osdc->osdmap->epoch < osdc->epoch_barrier); } enum calc_target_result { @@ -1503,9 +1505,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ceph_encode_32(&p, req->r_flags); ceph_encode_timespec(p, &req->r_mtime); p += sizeof(struct ceph_timespec); - /* aka reassert_version */ - memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); - p += sizeof(req->r_replay_version); + + /* reassert_version */ + memset(p, 0, sizeof(struct ceph_eversion)); + p += sizeof(struct ceph_eversion); /* oloc */ ceph_start_encoding(&p, 5, 4, @@ -1626,6 +1629,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } +static void complete_request(struct ceph_osd_request *req, int err); static void send_map_check(struct ceph_osd_request *req); static void __submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -1635,6 +1639,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) enum calc_target_result ct_res; bool need_send = false; bool promoted = false; + bool need_abort = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); @@ -1650,8 +1655,13 @@ again: goto promote; } - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + if (osdc->osdmap->epoch < osdc->epoch_barrier) { + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, + osdc->epoch_barrier); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); @@ -1669,6 +1679,8 @@ again: pr_warn_ratelimited("FULL or reached pool quota\n"); req->r_t.paused = true; maybe_request_map(osdc); + if (req->r_abort_on_full) + need_abort = true; } else if (!osd_homeless(osd)) { need_send = true; } else { @@ -1685,6 +1697,8 @@ again: link_request(osd, req); if (need_send) send_request(req); + else if (need_abort) + complete_request(req, -ENOSPC); mutex_unlock(&osd->lock); if (ct_res == CALC_TARGET_POOL_DNE) @@ -1799,6 +1813,97 @@ static void abort_request(struct ceph_osd_request *req, int err) complete_request(req, err); } +static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + if (likely(eb > osdc->epoch_barrier)) { + dout("updating epoch_barrier from %u to %u\n", + osdc->epoch_barrier, eb); + osdc->epoch_barrier = eb; + /* Request map if we're not to the barrier yet */ + if (eb > osdc->osdmap->epoch) + maybe_request_map(osdc); + } +} + +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + down_read(&osdc->lock); + if (unlikely(eb > osdc->epoch_barrier)) { + up_read(&osdc->lock); + down_write(&osdc->lock); + update_epoch_barrier(osdc, eb); + up_write(&osdc->lock); + } else { + up_read(&osdc->lock); + } +} +EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); + +/* + * Drop all pending requests that are stalled waiting on a full condition to + * clear, and complete them with ENOSPC as the return code. Set the + * osdc->epoch_barrier to the latest map epoch that we've seen if any were + * cancelled. + */ +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + bool victims = false; + + dout("enter abort_on_full\n"); + + if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) + goto out; + + /* Scan list and see if there is anything to abort */ + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + struct rb_node *m; + + m = rb_first(&osd->o_requests); + while (m) { + struct ceph_osd_request *req = rb_entry(m, + struct ceph_osd_request, r_node); + m = rb_next(m); + + if (req->r_abort_on_full) { + victims = true; + break; + } + } + if (victims) + break; + } + + if (!victims) + goto out; + + /* + * Update the barrier to current epoch if it's behind that point, + * since we know we have some calls to be aborted in the tree. + */ + update_epoch_barrier(osdc, osdc->osdmap->epoch); + + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + struct rb_node *m; + + m = rb_first(&osd->o_requests); + while (m) { + struct ceph_osd_request *req = rb_entry(m, + struct ceph_osd_request, r_node); + m = rb_next(m); + + if (req->r_abort_on_full && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.target_oloc.pool))) + abort_request(req, -ENOSPC); + } + } +out: + dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); +} + static void check_pool_dne(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -3252,11 +3357,13 @@ done: pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); - if (was_pauserd || was_pausewr || pauserd || pausewr) + if (was_pauserd || was_pausewr || pauserd || pausewr || + osdc->osdmap->epoch < osdc->epoch_barrier) maybe_request_map(osdc); kick_requests(osdc, &need_resend, &need_resend_linger); + ceph_osdc_abort_on_full(osdc); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch); up_write(&osdc->lock); @@ -3574,7 +3681,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_WRITE; - lreq->mtime = CURRENT_TIME; + ktime_get_real_ts(&lreq->mtime); lreq->reg_req = alloc_linger_request(lreq); if (!lreq->reg_req) { @@ -3632,7 +3739,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; - req->r_mtime = CURRENT_TIME; + ktime_get_real_ts(&req->r_mtime); osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); @@ -4126,7 +4233,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) close_osd(osd); } up_write(&osdc->lock); - WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1); osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 6864007e64fc..ce09f73be759 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) void ceph_pagelist_release(struct ceph_pagelist *pl) { - if (!atomic_dec_and_test(&pl->refcnt)) + if (!refcount_dec_and_test(&pl->refcnt)) return; ceph_pagelist_unmap_tail(pl); while (!list_empty(&pl->head)) { diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 705414e78ae0..e14a5d038656 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, if (!snapc) return NULL; - atomic_set(&snapc->nref, 1); + refcount_set(&snapc->nref, 1); snapc->num_snaps = snap_count; return snapc; @@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context); struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) { if (sc) - atomic_inc(&sc->nref); + refcount_inc(&sc->nref); return sc; } EXPORT_SYMBOL(ceph_get_snap_context); @@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc) { if (!sc) return; - if (atomic_dec_and_test(&sc->nref)) { + if (refcount_dec_and_test(&sc->nref)) { /*printk(" deleting snap_context %p\n", sc);*/ kfree(sc); } diff --git a/net/core/dev.c b/net/core/dev.c index d07aa5ffb511..fca407b4a6ea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -81,6 +81,7 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/mutex.h> #include <linux/string.h> #include <linux/mm.h> @@ -4235,7 +4236,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -4246,9 +4247,9 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_core(skb, true); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_core(skb, false); @@ -6851,6 +6852,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op) +{ + struct netdev_xdp xdp; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + /* Query must always succeed. */ + WARN_ON(xdp_op(dev, &xdp) < 0); + return xdp.prog_attached; +} + +static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, + struct netlink_ext_ack *extack, + struct bpf_prog *prog) +{ + struct netdev_xdp xdp; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_SETUP_PROG; + xdp.extack = extack; + xdp.prog = prog; + + return xdp_op(dev, &xdp); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -6863,41 +6890,34 @@ EXPORT_SYMBOL(dev_change_proto_down); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { - int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp; + xdp_op_t xdp_op, xdp_chk; int err; ASSERT_RTNL(); - xdp_op = ops->ndo_xdp; + xdp_op = xdp_chk = ops->ndo_xdp; + if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE)) + return -EOPNOTSUPP; if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) xdp_op = generic_xdp_install; + if (xdp_op == xdp_chk) + xdp_chk = generic_xdp_install; if (fd >= 0) { - if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - - err = xdp_op(dev, &xdp); - if (err < 0) - return err; - if (xdp.prog_attached) - return -EBUSY; - } + if (xdp_chk && __dev_xdp_attached(dev, xdp_chk)) + return -EEXIST; + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && + __dev_xdp_attached(dev, xdp_op)) + return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_SETUP_PROG; - xdp.extack = extack; - xdp.prog = prog; - - err = xdp_op(dev, &xdp); + err = dev_xdp_install(dev, xdp_op, extack, prog); if (err < 0 && prog) bpf_prog_put(prog); @@ -7264,12 +7284,10 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!rx) { - rx = vzalloc(sz); - if (!rx) - return -ENOMEM; - } + rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + if (!rx) + return -ENOMEM; + dev->_rx = rx; for (i = 0; i < count; i++) @@ -7306,12 +7324,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!tx) { - tx = vzalloc(sz); - if (!tx) - return -ENOMEM; - } + tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + if (!tx) + return -ENOMEM; + dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -7845,9 +7861,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!p) - p = vzalloc(alloc_size); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); if (!p) return NULL; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 58b0bcc125b5..d274f81fcc2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1132,10 +1132,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; - neigh->updated = jiffies; - /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ @@ -1157,6 +1153,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } + /* Update timestamps only once we know we will make a change to the + * neighbour entry. Otherwise we risk to move the locktime window with + * noop updates and ignore relevant ARP updates. + */ + if (new != old || lladdr != neigh->ha) { + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + } + if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bcb0f610ee42..9e2c0a7cb325 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -899,8 +899,7 @@ static size_t rtnl_port_size(const struct net_device *dev, static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4); /* XDP_FLAGS */ + nla_total_size(1); /* XDP_ATTACHED */ return xdp_size; } @@ -1247,37 +1246,34 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } +static u8 rtnl_xdp_attached_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + ASSERT_RTNL(); + + if (rcu_access_pointer(dev->xdp_prog)) + return XDP_ATTACHED_SKB; + if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp)) + return XDP_ATTACHED_DRV; + + return XDP_ATTACHED_NONE; +} + static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; - u32 xdp_flags = 0; - u8 val = 0; int err; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - if (rcu_access_pointer(dev->xdp_prog)) { - xdp_flags = XDP_FLAGS_SKB_MODE; - val = 1; - } else if (dev->netdev_ops->ndo_xdp) { - struct netdev_xdp xdp_op = {}; - - xdp_op.command = XDP_QUERY_PROG; - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); - if (err) - goto err_cancel; - val = xdp_op.prog_attached; - } - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, + rtnl_xdp_attached_mode(dev)); if (err) goto err_cancel; - if (xdp_flags) { - err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); - if (err) - goto err_cancel; - } nla_nest_end(skb, xdp); return 0; @@ -1631,13 +1627,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, 0, flags, ext_filter_mask); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); - if (err < 0) - goto out; + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: @@ -1645,10 +1641,12 @@ cont: } } out: + err = skb->len; +out_err: cb->args[1] = idx; cb->args[0] = h; - return skb->len; + return err; } int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, @@ -2199,6 +2197,11 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + if ((xdp_flags & XDP_FLAGS_SKB_MODE) && + (xdp_flags & XDP_FLAGS_DRV_MODE)) { + err = -EINVAL; + goto errout; + } } if (xdp[IFLA_XDP_FD]) { @@ -3228,8 +3231,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL) == 0) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } @@ -3452,8 +3458,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } @@ -3464,16 +3474,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } } + err = skb->len; +out_err: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline size_t bridge_nlmsg_size(void) diff --git a/net/core/sock.c b/net/core/sock.c index b5baeb9cb0fb..727f924b7f91 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -102,6 +102,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> @@ -138,10 +139,7 @@ #include <trace/events/sock.h> -#ifdef CONFIG_INET #include <net/tcp.h> -#endif - #include <net/busy_poll.h> static DEFINE_MUTEX(proto_list_mutex); @@ -372,14 +370,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = sk->sk_backlog_rcv(sk, skb); - current_restore_flags(pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return ret; } @@ -1802,28 +1800,24 @@ EXPORT_SYMBOL(skb_set_owner_w); * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers - * rely on it (sch_fq for example). So we set skb->truesize to a small - * amount (1) and decrease sk_wmem_alloc accordingly. + * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { - /* If this skb is a TCP pure ACK or already went here, - * we have nothing to do. 2 is already a very small truesize. - */ - if (skb->truesize <= 2) + if (skb_is_tcp_pure_ack(skb)) return; - /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, - * so we do not completely orphan skb, but transfert all - * accounted bytes but one, to avoid unexpected reorders. - */ if (skb->destructor == sock_wfree #ifdef CONFIG_INET || skb->destructor == tcp_wfree #endif ) { - atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); - skb->truesize = 1; + struct sock *sk = skb->sk; + + if (atomic_inc_not_zero(&sk->sk_refcnt)) { + atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + skb->destructor = sock_efree; + } } else { skb_orphan(skb); } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b99168b0fabf..f75482bdee9a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d9b6a4e403e7..992621172220 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -426,6 +426,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, newsk->sk_backlog_rcv = dccp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; @@ -490,6 +493,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); @@ -1014,7 +1020,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 9afa2a5030b2..405483a07efc 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2361,7 +2361,8 @@ MODULE_AUTHOR("Linux DECnet Project Team"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_DECnet); -static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; +static const char banner[] __initconst = KERN_INFO +"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; static int __init decnet_init(void) { diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 482730cd8a56..eeb5fc561f80 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -110,7 +110,7 @@ struct neigh_table dn_neigh_table = { static int dn_neigh_construct(struct neighbour *neigh) { struct net_device *dev = neigh->dev; - struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); struct dn_dev *dn_db; struct neigh_parms *parms; @@ -339,7 +339,7 @@ int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *) dst; struct neighbour *neigh = rt->n; - struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); struct dn_dev *dn_db; bool use_long; @@ -391,7 +391,7 @@ int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - dn = (struct dn_neigh *)neigh; + dn = container_of(neigh, struct dn_neigh, n); if (neigh) { write_lock(&neigh->lock); @@ -451,7 +451,7 @@ int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - dn = (struct dn_neigh *)neigh; + dn = container_of(neigh, struct dn_neigh, n); if (neigh) { write_lock(&neigh->lock); @@ -510,7 +510,7 @@ static void neigh_elist_cb(struct neighbour *neigh, void *_info) if (neigh->dev != s->dev) return; - dn = (struct dn_neigh *) neigh; + dn = container_of(neigh, struct dn_neigh, n); if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) return; @@ -549,7 +549,7 @@ int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n) static inline void dn_neigh_format_entry(struct seq_file *seq, struct neighbour *n) { - struct dn_neigh *dn = (struct dn_neigh *) n; + struct dn_neigh *dn = container_of(n, struct dn_neigh, n); char buf[DN_ASCBUF_LEN]; read_lock(&n->lock); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 0937b34c27ca..ae96e6f3e0cb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -641,6 +641,32 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); +static bool arp_is_garp(struct net *net, struct net_device *dev, + int *addr_type, __be16 ar_op, + __be32 sip, __be32 tip, + unsigned char *sha, unsigned char *tha) +{ + bool is_garp = tip == sip; + + /* Gratuitous ARP _replies_ also require target hwaddr to be + * the same as source. + */ + if (is_garp && ar_op == htons(ARPOP_REPLY)) + is_garp = + /* IPv4 over IEEE 1394 doesn't provide target + * hardware address field in its ARP payload. + */ + tha && + !memcmp(tha, sha, dev->addr_len); + + if (is_garp) { + *addr_type = inet_addr_type_dev_table(net, dev, sip); + if (*addr_type != RTN_UNICAST) + is_garp = false; + } + return is_garp; +} + /* * Process an arp request. */ @@ -653,6 +679,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; + unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; @@ -724,6 +751,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) break; #endif default: + tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); @@ -835,19 +863,25 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); - if (IN_DEV_ARP_ACCEPT(in_dev)) { - unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + if (n || IN_DEV_ARP_ACCEPT(in_dev)) { + addr_type = -1; + is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, + sip, tip, sha, tha); + } + if (IN_DEV_ARP_ACCEPT(in_dev)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ - is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - addr_type == RTN_UNICAST; - if (!n && - ((arp->ar_op == htons(ARPOP_REPLY) && - addr_type == RTN_UNICAST) || is_garp)) + (is_garp || + (arp->ar_op == htons(ARPOP_REPLY) && + (addr_type == RTN_UNICAST || + (addr_type < 0 && + /* postpone calculation to as late as possible */ + inet_addr_type_dev_table(net, dev, sip) == + RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 65cc02bd82bc..93322f895eab 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; @@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); - esp->esph = ip_esp_hdr(skb); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 39bd1edee676..83e3ed258467 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -763,7 +763,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; - int dumped = 0; + int dumped = 0, err; if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) @@ -783,20 +783,27 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (fib_table_dump(tb, skb, cb) < 0) - goto out; + err = fib_table_dump(tb, skb, cb); + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } dumped = 1; next: e++; } } out: + err = skb->len; +out_err: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - return skb->len; + return err; } /* Prepare and feed intra-kernel routing request. diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1201409ba1dc..51182ff2b441 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1983,6 +1983,8 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + int err; + if (i < s_i) { i++; continue; @@ -1993,17 +1995,14 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, continue; } - if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, - fa->fa_info, NLM_F_MULTI) < 0) { + err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, + tb->tb_id, fa->fa_type, + xkey, KEYLENGTH - fa->fa_slen, + fa->fa_tos, fa->fa_info, NLM_F_MULTI); + if (err < 0) { cb->args[4] = i; - return -1; + return err; } i++; } @@ -2025,10 +2024,13 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, t_key key = cb->args[3]; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { + int err; + + err = fn_trie_dump_leaf(l, tb, skb, cb); + if (err < 0) { cb->args[3] = key; cb->args[2] = count; - return -1; + return err; } ++count; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5e313c1ac94f..1054d330bf9d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -794,6 +794,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); + inet_sk(newsk)->mc_list = NULL; + newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8bea74298173..e9a59d2d91d4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -678,11 +678,7 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); - hashinfo->ehash_locks = kmalloc_array(nblocks, locksz, - GFP_KERNEL | __GFP_NOWARN); - if (!hashinfo->ehash_locks) - hashinfo->ehash_locks = vmalloc(nblocks * locksz); - + hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a02d52ed50e..551de4d023a8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1980,6 +1980,20 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; + struct net_device *dev; + + /* skb->dev passed in is the loX master dev for vrfs. + * As there are no vifs associated with loopback devices, + * get the proper interface that does have a vif associated with it. + */ + dev = skb->dev; + if (netif_is_l3_master(skb->dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -2017,7 +2031,7 @@ int ip_mr_input(struct sk_buff *skb) /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); if (!cache) { - int vif = ipmr_find_vif(mrt, skb->dev); + int vif = ipmr_find_vif(mrt, dev); if (vif >= 0) cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, @@ -2037,7 +2051,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - vif = ipmr_find_vif(mrt, skb->dev); + vif = ipmr_find_vif(mrt, dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); read_unlock(&mrt_lock); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f647310f8e4d..655d9eebe43e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1387,11 +1387,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; - if (rt->fi) { - fib_info_put(rt->fi); - rt->fi = NULL; - } - if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1429,16 +1424,6 @@ static bool rt_cache_valid(const struct rtable *rt) !rt_is_expired(rt); } -static void rt_init_metrics(struct rtable *rt, struct fib_info *fi) -{ - if (fi->fib_metrics != (u32 *)dst_default_metrics) { - fib_info_hold(fi); - rt->fi = fi; - } - - dst_init_metrics(&rt->dst, fi->fib_metrics, true); -} - static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, @@ -1453,7 +1438,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - rt_init_metrics(rt, fi); + dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -1505,7 +1490,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_gateway = 0; rt->rt_uses_gateway = 0; rt->rt_table_id = 0; - rt->fi = NULL; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1e4c76d2b827..842b575f8fdd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2320,6 +2320,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5a3ad09e2786..174d4376baa5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1179,13 +1179,14 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, */ if (pkt_len > mss) { unsigned int new_len = (pkt_len / mss) * mss; - if (!in_sack && new_len < pkt_len) { + if (!in_sack && new_len < pkt_len) new_len += mss; - if (new_len >= skb->len) - return 0; - } pkt_len = new_len; } + + if (pkt_len >= skb->len && !in_sack) + return 0; + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; @@ -3189,7 +3190,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, int delta; /* Non-retransmitted hole got filled? That's reordering */ - if (reord < prior_fackets) + if (reord < prior_fackets && reord <= tp->fackets_out) tcp_update_reordering(sk, tp->fackets_out - reord, 0); delta = tcp_is_fack(tp) ? pkts_acked : diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3a51582bef55..5ab2aac5ca19 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2395,7 +2395,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d0d4f39e42b..653bbd67e3a3 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1011,10 +1011,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!tcp_metrics_hash) - tcp_metrics_hash = vzalloc(size); - + tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) return -ENOMEM; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ea6e4cff9faf..1d6219bf2d6b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1612,7 +1612,7 @@ static void udp_v4_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -1657,7 +1657,7 @@ EXPORT_SYMBOL(udp_encap_enable); * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ -int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index feb50a16398d..a8cf8c6fb60c 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -25,7 +25,6 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 77a4bd526d6e..6a4fb1e629fb 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1022,7 +1022,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, INIT_HLIST_NODE(&ifa->addr_lst); ifa->scope = scope; ifa->prefix_len = pfxlen; - ifa->flags = flags | IFA_F_TENTATIVE; + ifa->flags = flags; + /* No need to add the TENTATIVE flag for addresses with NODAD */ + if (!(flags & IFA_F_NODAD)) + ifa->flags |= IFA_F_TENTATIVE; ifa->valid_lft = valid_lft; ifa->prefered_lft = prefered_lft; ifa->cstamp = ifa->tstamp = jiffies; @@ -3548,6 +3551,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ static struct notifier_block ipv6_dev_notf = { .notifier_call = addrconf_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, }; static void addrconf_type_change(struct net_device *dev, unsigned long event) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180..2fd5ca151dcf 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 93e58a5e1837..280268f1dd7b 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -63,7 +63,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, const struct net_offload *ops; int proto; struct frag_hdr *fptr; - unsigned int unfrag_ip6hlen; unsigned int payload_len; u8 *prevhdr; int offset = 0; @@ -116,8 +115,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, skb->network_header = (u8 *)ipv6h - skb->head; if (udpfrag) { - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); - fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen); + int err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 58f6288e9ba5..bf8a58a1c32d 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -597,7 +597,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; - hlen = ip6_find_1stfragopt(skb, &prevhdr); + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto fail; + hlen = err; nexthdr = *prevhdr; mtu = ip6_skb_dst_mtu(skb); @@ -1463,6 +1466,11 @@ alloc_new_skb: */ alloclen += sizeof(struct frag_hdr); + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + goto error; + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len, @@ -1512,13 +1520,9 @@ alloc_new_skb: data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } - copy = datalen - transhdrlen - fraggap; - - if (copy < 0) { - err = -EINVAL; - kfree_skb(skb); - goto error; - } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + getfrag(from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index cd4252346a32..e9065b8d3af8 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -79,14 +79,13 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); - struct ipv6_opt_hdr *exthdr = - (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; - while (offset + 1 <= packet_len) { + while (offset <= packet_len) { + struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { @@ -107,13 +106,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } - offset += ipv6_optlen(exthdr); - *nexthdr = &exthdr->nexthdr; + if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) + return -EINVAL; + exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; } - return offset; + return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2f1136627dcb..dc61b0b5e64e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3709,7 +3709,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { + if (!(dev->flags & IFF_LOOPBACK)) + return NOTIFY_OK; + + if (event == NETDEV_REGISTER) { net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -3718,6 +3721,12 @@ static int ip6_route_dev_notify(struct notifier_block *this, net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif + } else if (event == NETDEV_UNREGISTER) { + in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); +#endif } return NOTIFY_OK; @@ -4024,7 +4033,7 @@ static struct pernet_operations ip6_route_net_late_ops = { static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, - .priority = 0, + .priority = ADDRCONF_NOTIFY_PRIORITY - 10, }; void __init ip6_route_init_special_entries(void) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index aeb9497b5bb7..4f4310a36a04 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1062,6 +1062,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif + newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; @@ -1131,6 +1132,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * First: no IPv4 options. */ newinet->inet_opt = NULL; + newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; @@ -1917,7 +1919,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 04862abfe4ec..06ec39b79609 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -526,7 +526,7 @@ out: return; } -int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; @@ -569,7 +569,7 @@ void udpv6_encap_enable(void) } EXPORT_SYMBOL(udpv6_encap_enable); -int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index e78bdc76dcc3..f180b3d85e31 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -26,7 +26,6 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); -int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ac858c480f2f..a2267f80febb 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u8 frag_hdr_sz = sizeof(struct frag_hdr); __wsum csum; int tnl_hlen; + int err; mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -90,7 +91,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Find the unfragmentable header and shift it left by frag_hdr_sz * bytes to insert fragment header. */ - unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + diff --git a/net/key/af_key.c b/net/key/af_key.c index c1950bb14735..512dc43d0ce6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3285,7 +3285,7 @@ static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + - sec_ctx->sadb_x_sec_len) { + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cb4fff785cbf..8364fe5b59e4 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -142,7 +142,7 @@ static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d45..9b02c13d258b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || @@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_listener_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc..63b6ab056370 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 088e2b459d0f..257ec66009da 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2005,10 +2005,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) unsigned index; if (size) { - labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (!labels) - labels = vzalloc(size); - + labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index d2d7bdf1d510..ad99c1ceea6f 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -849,10 +849,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, { unsigned int verdict = NF_DROP; - if (IP_VS_FWD_METHOD(cp) != 0) { - pr_err("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto ignore_cp; /* Ensure the checksum is correct */ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { @@ -886,6 +884,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); + +ignore_cp: verdict = NF_ACCEPT; out: @@ -1385,8 +1385,11 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in */ cp = pp->conn_out_get(ipvs, af, skb, &iph); - if (likely(cp)) + if (likely(cp)) { + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + goto ignore_cp; return handle_response(af, skb, pd, cp, &iph, hooknum); + } /* Check for real-server-started requests */ if (atomic_read(&ipvs->conn_out_counter)) { @@ -1444,9 +1447,15 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in } } } + +out: IP_VS_DBG_PKT(12, af, pp, skb, iph.off, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; + +ignore_cp: + __ip_vs_conn_put(cp); + goto out; } /* diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3c8f1ed2f555..e847dbaa0c6b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -911,7 +911,7 @@ static unsigned int early_drop_list(struct net *net, continue; /* kill only if still in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules. + * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. Just drop ref @@ -1114,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net, /* * Do not use kmem_cache_zalloc(), as this cache uses - * SLAB_DESTROY_BY_RCU. + * SLAB_TYPESAFE_BY_RCU. */ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) @@ -1159,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct) struct net *net = nf_ct_net(ct); /* A freed object has refcnt == 0, that's - * the golden rule for SLAB_DESTROY_BY_RCU + * the golden rule for SLAB_TYPESAFE_BY_RCU */ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); @@ -1929,7 +1929,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, - SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3a60efa7799b..7f6100ca63be 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -174,6 +174,10 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) #endif if (h != NULL && !try_module_get(h->me)) h = NULL; + if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) { + module_put(h->me); + h = NULL; + } rcu_read_unlock(); @@ -181,6 +185,13 @@ nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); +void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) +{ + refcount_dec(&helper->refcnt); + module_put(helper->me); +} +EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); + struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, struct nf_conntrack_helper *helper, gfp_t gfp) @@ -417,6 +428,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) } } } + refcount_set(&me->refcnt, 1); hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; out: diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dcf561b5c97a..9799a50bc604 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -45,6 +45,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_synproxy.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l4proto.h> @@ -1007,9 +1009,8 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { static int ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num, - struct nf_conntrack_zone *zone) + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -1828,6 +1829,8 @@ ctnetlink_create_conntrack(struct net *net, nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); nf_ct_labels_ext_add(ct); + nfct_seqadj_ext_add(ct); + nfct_synproxy_ext_add(ct); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; @@ -2447,7 +2450,7 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = { static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, - enum ctattr_expect type) + u32 type) { struct nlattr *nest_parms; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b48d6b5aae8a..ef0be325a0c6 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -409,6 +409,10 @@ nf_nat_setup_info(struct nf_conn *ct, { struct nf_conntrack_tuple curr_tuple, new_tuple; + /* Can't setup nat info for confirmed ct. */ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; + NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); BUG_ON(nf_nat_initialized(ct, maniptype)); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 559225029740..da314be0c048 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3367,35 +3367,50 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, return nf_tables_fill_setelem(args->skb, set, elem); } +struct nft_set_dump_ctx { + const struct nft_set *set; + struct nft_ctx ctx; +}; + static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); - u8 genmask = nft_genmask_cur(net); + struct nft_af_info *afi; + struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; - struct nft_ctx ctx; - struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1]; + bool set_found = false; struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; - int event, err; + int event; - err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, - NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy, - NULL); - if (err < 0) - return err; + rcu_read_lock(); + list_for_each_entry_rcu(afi, &net->nft.af_info, list) { + if (afi != dump_ctx->ctx.afi) + continue; - err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, - (void *)nla, genmask); - if (err < 0) - return err; + list_for_each_entry_rcu(table, &afi->tables, list) { + if (table != dump_ctx->ctx.table) + continue; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); + list_for_each_entry_rcu(set, &table->sets, list) { + if (set == dump_ctx->set) { + set_found = true; + break; + } + } + break; + } + break; + } + + if (!set_found) { + rcu_read_unlock(); + return -ENOENT; + } event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; @@ -3407,11 +3422,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) goto nla_put_failure; nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx.afi->family; + nfmsg->nfgen_family = afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) + if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) goto nla_put_failure; @@ -3422,12 +3437,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.cb = cb; args.skb = skb; - args.iter.genmask = nft_genmask_cur(ctx.net); + args.iter.genmask = nft_genmask_cur(net); args.iter.skip = cb->args[0]; args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; - set->ops->walk(&ctx, set, &args.iter); + set->ops->walk(&dump_ctx->ctx, set, &args.iter); + rcu_read_unlock(); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -3441,9 +3457,16 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; nla_put_failure: + rcu_read_unlock(); return -ENOSPC; } +static int nf_tables_dump_set_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -3465,7 +3488,18 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_set, + .done = nf_tables_dump_set_done, }; + struct nft_set_dump_ctx *dump_ctx; + + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + if (!dump_ctx) + return -ENOMEM; + + dump_ctx->set = set; + dump_ctx->ctx = ctx; + + c.data = dump_ctx; return netlink_dump_start(nlsk, skb, nlh, &c); } return -EOPNOTSUPP; @@ -3593,9 +3627,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); - nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); + nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) - nft_data_uninit(nft_set_ext_data(ext), set->dtype); + nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) @@ -3604,6 +3638,18 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +/* Only called from commit path, nft_set_elem_deactivate() already deals with + * the refcounting from the preparation phase. + */ +static void nf_tables_set_elem_destroy(const struct nft_set *set, void *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); + kfree(elem); +} + static int nft_setelem_parse_flags(const struct nft_set *set, const struct nlattr *attr, u32 *flags) { @@ -3815,9 +3861,9 @@ err4: kfree(elem.priv); err3: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_uninit(&data, d2.type); + nft_data_release(&data, d2.type); err2: - nft_data_uninit(&elem.key.val, d1.type); + nft_data_release(&elem.key.val, d1.type); err1: return err; } @@ -3862,6 +3908,53 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return err; } +/** + * nft_data_hold - hold a nft_data item + * + * @data: struct nft_data to release + * @type: type of data + * + * Hold a nft_data item. NFT_DATA_VALUE types can be silently discarded, + * NFT_DATA_VERDICT bumps the reference to chains in case of NFT_JUMP and + * NFT_GOTO verdicts. This function must be called on active data objects + * from the second phase of the commit protocol. + */ +static void nft_data_hold(const struct nft_data *data, enum nft_data_types type) +{ + if (type == NFT_DATA_VERDICT) { + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + data->verdict.chain->use++; + break; + } + } +} + +static void nft_set_elem_activate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_hold(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use++; +} + +static void nft_set_elem_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) + nft_data_release(nft_set_ext_data(ext), set->dtype); + if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) + (*nft_set_ext_obj(ext))->use--; +} + static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3927,6 +4020,8 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, kfree(elem.priv); elem.priv = priv; + nft_set_elem_deactivate(ctx->net, set, &elem); + nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -3936,7 +4031,7 @@ err4: err3: kfree(elem.priv); err2: - nft_data_uninit(&elem.key.val, desc.type); + nft_data_release(&elem.key.val, desc.type); err1: return err; } @@ -4743,8 +4838,8 @@ static void nf_tables_commit_release(struct nft_trans *trans) nft_set_destroy(nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv, true); + nf_tables_set_elem_destroy(nft_trans_elem_set(trans), + nft_trans_elem(trans).priv); break; case NFT_MSG_DELOBJ: nft_obj_destroy(nft_trans_obj(trans)); @@ -4979,6 +5074,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; + nft_set_elem_activate(net, te->set, &te->elem); te->set->ops->activate(net, te->set, &te->elem); te->set->ndeact--; @@ -5464,7 +5560,7 @@ int nft_data_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_data_init); /** - * nft_data_uninit - release a nft_data item + * nft_data_release - release a nft_data item * * @data: struct nft_data to release * @type: type of data @@ -5472,7 +5568,7 @@ EXPORT_SYMBOL_GPL(nft_data_init); * Release a nft_data item. NFT_DATA_VALUE types can be silently discarded, * all others need to be released by calling this function. */ -void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) +void nft_data_release(const struct nft_data *data, enum nft_data_types type) { if (type < NFT_DATA_VERDICT) return; @@ -5483,7 +5579,7 @@ void nft_data_uninit(const struct nft_data *data, enum nft_data_types type) WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_data_uninit); +EXPORT_SYMBOL_GPL(nft_data_release); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 950bf6eadc65..be678a323598 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -686,6 +686,7 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, tuple_set = true; } + ret = -ENOENT; list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; j++; @@ -699,16 +700,20 @@ static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, tuple.dst.protonum != cur->tuple.dst.protonum)) continue; - found = true; - nf_conntrack_helper_unregister(cur); - kfree(cur->expect_policy); + if (refcount_dec_if_one(&cur->refcnt)) { + found = true; + nf_conntrack_helper_unregister(cur); + kfree(cur->expect_policy); - list_del(&nlcth->list); - kfree(nlcth); + list_del(&nlcth->list); + kfree(nlcth); + } else { + ret = -EBUSY; + } } /* Make sure we return success if we flush and there is no helpers */ - return (found || j == 0) ? 0 : -ENOENT; + return (found || j == 0) ? 0 : ret; } static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 877d9acd91ef..fff8073e2a56 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -83,17 +83,26 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (d1.len != priv->len) - return -EINVAL; + if (d1.len != priv->len) { + err = -EINVAL; + goto err1; + } err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, tb[NFTA_BITWISE_XOR]); if (err < 0) - return err; - if (d2.len != priv->len) - return -EINVAL; + goto err1; + if (d2.len != priv->len) { + err = -EINVAL; + goto err2; + } return 0; +err2: + nft_data_release(&priv->xor, d2.type); +err1: + nft_data_release(&priv->mask, d1.type); + return err; } static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 2b96effeadc1..c2945eb3397c 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -201,10 +201,18 @@ nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) if (err < 0) return ERR_PTR(err); + if (desc.type != NFT_DATA_VALUE) { + err = -EINVAL; + goto err1; + } + if (desc.len <= sizeof(u32) && op == NFT_CMP_EQ) return &nft_cmp_fast_ops; - else - return &nft_cmp_ops; + + return &nft_cmp_ops; +err1: + nft_data_release(&data, desc.type); + return ERR_PTR(-EINVAL); } struct nft_expr_type nft_cmp_type __read_mostly = { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a34ceb38fc55..1678e9e75e8e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -826,9 +826,9 @@ static void nft_ct_helper_obj_destroy(struct nft_object *obj) struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) - module_put(priv->helper4->me); + nf_conntrack_helper_put(priv->helper4); if (priv->helper6) - module_put(priv->helper6->me); + nf_conntrack_helper_put(priv->helper6); } static void nft_ct_helper_obj_eval(struct nft_object *obj, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 728baf88295a..4717d7796927 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -65,7 +65,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx, return 0; err1: - nft_data_uninit(&priv->data, desc.type); + nft_data_release(&priv->data, desc.type); return err; } @@ -73,7 +73,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); - return nft_data_uninit(&priv->data, nft_dreg_to_type(priv->dreg)); + + return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 9edc74eedc10..cedb96c3619f 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -102,9 +102,9 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr priv->len = desc_from.len; return 0; err2: - nft_data_uninit(&priv->data_to, desc_to.type); + nft_data_release(&priv->data_to, desc_to.type); err1: - nft_data_uninit(&priv->data_from, desc_from.type); + nft_data_release(&priv->data_from, desc_from.type); return err; } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 8ec086b6b56b..3d3a6df4ce70 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -222,7 +222,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; int err; - err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC); iter->err = err; if (err) return; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index f134d384852f..1770c1d9b37f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -283,28 +283,30 @@ static int xt_obj_to_user(u16 __user *psize, u16 size, &U->u.user.revision, K->u.kernel.TYPE->revision) int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size) + int usersize, int size, int aligned_size) { usersize = usersize ? : size; if (copy_to_user(dst, src, usersize)) return -EFAULT; - if (usersize != size && clear_user(dst + usersize, size - usersize)) + if (usersize != aligned_size && + clear_user(dst + usersize, aligned_size - usersize)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(xt_data_to_user); -#define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ +#define XT_DATA_TO_USER(U, K, TYPE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ - C_SIZE ? : K->u.kernel.TYPE->TYPE##size) + K->u.kernel.TYPE->TYPE##size, \ + XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) { return XT_OBJ_TO_USER(u, m, match, 0) || - XT_DATA_TO_USER(u, m, match, 0); + XT_DATA_TO_USER(u, m, match); } EXPORT_SYMBOL_GPL(xt_match_to_user); @@ -312,7 +314,7 @@ int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u) { return XT_OBJ_TO_USER(u, t, target, 0) || - XT_DATA_TO_USER(u, t, target, 0); + XT_DATA_TO_USER(u, t, target); } EXPORT_SYMBOL_GPL(xt_target_to_user); @@ -611,6 +613,12 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, } EXPORT_SYMBOL_GPL(xt_compat_match_from_user); +#define COMPAT_XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ + xt_data_to_user(U->data, K->data, \ + K->u.kernel.TYPE->usersize, \ + C_SIZE, \ + COMPAT_XT_ALIGN(C_SIZE)) + int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size) { @@ -626,7 +634,7 @@ int xt_compat_match_to_user(const struct xt_entry_match *m, if (match->compat_to_user((void __user *)cm->data, m->data)) return -EFAULT; } else { - if (XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) + if (COMPAT_XT_DATA_TO_USER(cm, m, match, msize - sizeof(*cm))) return -EFAULT; } @@ -763,17 +771,8 @@ EXPORT_SYMBOL(xt_check_entry_offsets); */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { - unsigned int *off; - - off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); + return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); - if (off) - return off; - - if (size < (SIZE_MAX / sizeof(unsigned int))) - off = vmalloc(size * sizeof(unsigned int)); - - return off; } EXPORT_SYMBOL(xt_alloc_entry_offsets); @@ -981,7 +980,7 @@ int xt_compat_target_to_user(const struct xt_entry_target *t, if (target->compat_to_user((void __user *)ct->data, t->data)) return -EFAULT; } else { - if (XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) + if (COMPAT_XT_DATA_TO_USER(ct, t, target, tsize - sizeof(*ct))) return -EFAULT; } @@ -1007,8 +1006,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!info) { - info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | - __GFP_NORETRY | __GFP_HIGHMEM, + info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, PAGE_KERNEL); if (!info) return NULL; @@ -1116,7 +1114,7 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) - i->jumpstack = vzalloc(size); + i->jumpstack = kvzalloc(size, GFP_KERNEL); else i->jumpstack = kzalloc(size, GFP_KERNEL); if (i->jumpstack == NULL) @@ -1138,12 +1136,8 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) */ size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { - if (size > PAGE_SIZE) - i->jumpstack[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - else - i->jumpstack[cpu] = kmalloc_node(size, - GFP_KERNEL, cpu_to_node(cpu)); + i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); if (i->jumpstack[cpu] == NULL) /* * Freeing will be done later on by the callers. The diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index bb7ad82dcd56..623ef37de886 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -96,7 +96,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name, help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL); if (help == NULL) { - module_put(helper->me); + nf_conntrack_helper_put(helper); return -ENOMEM; } @@ -263,7 +263,7 @@ out: err4: help = nfct_help(ct); if (help) - module_put(help->helper->me); + nf_conntrack_helper_put(help->helper); err3: nf_ct_tmpl_free(ct); err2: @@ -346,7 +346,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, if (ct) { help = nfct_help(ct); if (help) - module_put(help->helper->me); + nf_conntrack_helper_put(help->helper); nf_ct_netns_put(par->net, par->family); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 37d581a31cff..3f6c4fa78bdb 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -388,10 +388,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, } sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; - if (sz <= PAGE_SIZE) - t = kzalloc(sz, GFP_KERNEL); - else - t = vzalloc(sz); + t = kvzalloc(sz, GFP_KERNEL); if (t == NULL) { ret = -ENOMEM; goto out; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index bf602e33c40a..08679ebb3068 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1123,7 +1123,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); if (!help) { - module_put(helper->me); + nf_conntrack_helper_put(helper); return -ENOMEM; } @@ -1584,7 +1584,7 @@ void ovs_ct_free_action(const struct nlattr *a) static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) { if (ct_info->helper) - module_put(ct_info->helper->me); + nf_conntrack_helper_put(ct_info->helper); if (ct_info->ct) nf_ct_tmpl_free(ct_info->ct); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f4001763134d..e3eeed19cc7a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2658,13 +2658,6 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); } - sockc.tsflags = po->sk.sk_tsflags; - if (msg->msg_controllen) { - err = sock_cmsg_send(&po->sk, msg, &sockc); - if (unlikely(err)) - goto out; - } - err = -ENXIO; if (unlikely(dev == NULL)) goto out; @@ -2672,6 +2665,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if (unlikely(!(dev->flags & IFF_UP))) goto out_put; + sockc.tsflags = po->sk.sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(&po->sk, msg, &sockc); + if (unlikely(err)) + goto out_put; + } + if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index dee469fed967..51859b8edd7e 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,7 +203,6 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bbe57d57b67f..e88342fde1bc 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1831,6 +1831,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (!qdisc_dev(root)) return 0; + if (tcm->tcm_parent) { + q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); + if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + return -1; + return 0; + } hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index d00f4c7c2f3a..b30a2c70bd48 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -376,10 +376,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), - GFP_KERNEL | __GFP_NOWARN); - if (!ntab) - ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO); if (!ntab) return -ENOMEM; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index da4f67bda0ee..b488721a0059 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -624,16 +624,6 @@ static void fq_rehash(struct fq_sched_data *q, q->stat_gc_flows += fcnt; } -static void *fq_alloc_node(size_t sz, int node) -{ - void *ptr; - - ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); - if (!ptr) - ptr = vmalloc_node(sz, node); - return ptr; -} - static void fq_free(void *addr) { kvfree(addr); @@ -650,7 +640,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; /* If XPS was setup, we can allocate memory on right NUMA node */ - array = fq_alloc_node(sizeof(struct rb_root) << log, + array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT, netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 18bbb5476c83..9201abce928c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -446,27 +446,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static void *fq_codel_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - return ptr; -} - -static void fq_codel_free(void *addr) -{ - kvfree(addr); -} - static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); tcf_destroy_chain(&q->filter_list); - fq_codel_free(q->backlogs); - fq_codel_free(q->flows); + kvfree(q->backlogs); + kvfree(q->flows); } static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) @@ -493,13 +479,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) } if (!q->flows) { - q->flows = fq_codel_zalloc(q->flows_cnt * - sizeof(struct fq_codel_flow)); + q->flows = kvzalloc(q->flows_cnt * + sizeof(struct fq_codel_flow), GFP_KERNEL); if (!q->flows) return -ENOMEM; - q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); if (!q->backlogs) { - fq_codel_free(q->flows); + kvfree(q->flows); return -ENOMEM; } for (i = 0; i < q->flows_cnt; i++) { diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c19d346e6c5a..51d3ba682af9 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -467,29 +467,14 @@ static void hhf_reset(struct Qdisc *sch) rtnl_kfree_skbs(skb, skb); } -static void *hhf_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - - return ptr; -} - -static void hhf_free(void *addr) -{ - kvfree(addr); -} - static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { - hhf_free(q->hhf_arrays[i]); - hhf_free(q->hhf_valid_bits[i]); + kvfree(q->hhf_arrays[i]); + kvfree(q->hhf_valid_bits[i]); } for (i = 0; i < HH_FLOWS_CNT; i++) { @@ -503,7 +488,7 @@ static void hhf_destroy(struct Qdisc *sch) kfree(flow); } } - hhf_free(q->hh_flows); + kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { @@ -609,8 +594,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ - q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * - sizeof(struct list_head)); + q->hh_flows = kvzalloc(HH_FLOWS_CNT * + sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) @@ -624,8 +609,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * - sizeof(u32)); + q->hhf_arrays[i] = kvzalloc(HHF_ARRAYS_LEN * + sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. @@ -637,8 +622,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / - BITS_PER_BYTE); + q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f0ce4780f395..1b3dd6190e93 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -702,15 +702,11 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) spinlock_t *root_lock; struct disttable *d; int i; - size_t s; if (n > NETEM_DIST_MAX) return -EINVAL; - s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); - if (!d) - d = vmalloc(s); + d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); if (!d) return -ENOMEM; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b00e02c139de..332d94be6e1c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -685,11 +685,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) static void *sfq_alloc(size_t sz) { - void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vmalloc(sz); - return ptr; + return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a9708da28eb5..95238284c422 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1176,7 +1176,9 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (!asoc->stream) { + + if (sctp_state(asoc, COOKIE_WAIT)) { + sctp_stream_free(asoc->stream); asoc->stream = new->stream; new->stream = NULL; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 961ee59f696a..f5b45b8b8b16 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -240,12 +240,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; - union sctp_addr *baddr = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; __u8 matchlen = 0; - __u8 bmatchlen; sctp_scope_t scope; memset(fl6, 0, sizeof(struct flowi6)); @@ -312,23 +310,37 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { - if (!laddr->valid) + struct dst_entry *bdst; + __u8 bmatchlen; + + if (!laddr->valid || + laddr->state != SCTP_ADDR_SRC || + laddr->a.sa.sa_family != AF_INET6 || + scope > sctp_scope(&laddr->a)) continue; - if ((laddr->state == SCTP_ADDR_SRC) && - (laddr->a.sa.sa_family == AF_INET6) && - (scope <= sctp_scope(&laddr->a))) { - bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); - if (!baddr || (matchlen < bmatchlen)) { - baddr = &laddr->a; - matchlen = bmatchlen; - } - } - } - if (baddr) { - fl6->saddr = baddr->v6.sin6_addr; - fl6->fl6_sport = baddr->v6.sin6_port; + + fl6->saddr = laddr->a.v6.sin6_addr; + fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); - dst = ip6_dst_lookup_flow(sk, fl6, final_p); + bdst = ip6_dst_lookup_flow(sk, fl6, final_p); + + if (!IS_ERR(bdst) && + ipv6_chk_addr(dev_net(bdst->dev), + &laddr->a.v6.sin6_addr, bdst->dev, 1)) { + if (!IS_ERR_OR_NULL(dst)) + dst_release(dst); + dst = bdst; + break; + } + + bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); + if (matchlen > bmatchlen) + continue; + + if (!IS_ERR_OR_NULL(dst)) + dst_release(dst); + dst = bdst; + matchlen = bmatchlen; } rcu_read_unlock(); @@ -665,6 +677,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + newnp->ipv6_fl_list = NULL; rcu_read_lock(); opt = rcu_dereference(np->opt); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 8a08f13469c4..92e332e17391 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2454,16 +2454,11 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * stream sequence number shall be set to 0. */ - /* Allocate storage for the negotiated streams if it is not a temporary - * association. - */ - if (!asoc->temp) { - if (sctp_stream_init(asoc, gfp)) - goto clean_up; + if (sctp_stream_init(asoc, gfp)) + goto clean_up; - if (sctp_assoc_set_id(asoc, gfp)) - goto clean_up; - } + if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) + goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4f5e6cfc7f60..f863b5573e42 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2088,6 +2088,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupcook(struct net *net, } } + /* Set temp so that it won't be added into hashtable */ + new_asoc->temp = 1; + /* Compare the tie_tag in cookie with the verification tag of * current association. */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index c717ef0896aa..33954852f3f8 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -8,6 +8,10 @@ config SMC The Linux implementation of the SMC-R solution is designed as a separate socket family SMC. + Warning: SMC will expose all memory for remote reads and writes + once a connection is established. Don't enable this option except + for tightly controlled lab environment. + Select this option if you want to run SMC socket applications config SMC_DIAG diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5b6ee21368a6..6793d7348cc8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -101,7 +101,7 @@ struct proto smc_proto = { .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e41f594a1e1d..03ec058d18df 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -204,7 +204,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); @@ -256,7 +256,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->rkey[SMC_SINGLE_LINK]); aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 65020e93ff21..3ac09a629ea1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -613,19 +613,8 @@ int smc_rmb_create(struct smc_sock *smc) rmb_desc = NULL; continue; /* if mapping failed, try smaller one */ } - rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - &rmb_desc->mr_rx[SMC_SINGLE_LINK]); - if (rc) { - smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - tmp_bufsize, rmb_desc, - DMA_FROM_DEVICE); - kfree(rmb_desc->cpu_addr); - kfree(rmb_desc); - rmb_desc = NULL; - continue; - } + rmb_desc->rkey[SMC_SINGLE_LINK] = + lgr->lnk[SMC_SINGLE_LINK].roce_pd->unsafe_global_rkey; rmb_desc->used = 1; write_lock_bh(&lgr->rmbs_lock); list_add(&rmb_desc->list, @@ -668,6 +657,7 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && + (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { conn->rtoken_idx = i; return 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 27eb38056a27..b013cb43a327 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -93,7 +93,7 @@ struct smc_buf_desc { u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; /* mapped address of buffer */ void *cpu_addr; /* virtual address of buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + u32 rkey[SMC_LINKS_PER_LGR_MAX]; /* for rmb only: * rkey provided to peer */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index cb69ab977cd7..b31715505a35 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -37,24 +37,6 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system * identifier */ -int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct ib_mr **mr) -{ - int rc; - - if (*mr) - return 0; /* already done */ - - /* obtain unique key - - * next invocation of get_dma_mr returns a different key! - */ - *mr = pd->device->get_dma_mr(pd, access_flags); - rc = PTR_ERR_OR_ZERO(*mr); - if (IS_ERR(*mr)) - *mr = NULL; - return rc; -} - static int smc_ib_modify_qp_init(struct smc_link *lnk) { struct ib_qp_attr qp_attr; @@ -210,7 +192,8 @@ int smc_ib_create_protection_domain(struct smc_link *lnk) { int rc; - lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, + IB_PD_UNSAFE_GLOBAL_RKEY); rc = PTR_ERR_OR_ZERO(lnk->roce_pd); if (IS_ERR(lnk->roce_pd)) lnk->roce_pd = NULL; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 7e1f0e24d177..b567152a526d 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -61,8 +61,6 @@ void smc_ib_dealloc_protection_domain(struct smc_link *lnk); int smc_ib_create_protection_domain(struct smc_link *lnk); void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); -int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct ib_mr **mr); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 04ce2c0b660e..ac09ca803296 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND + select SG_POOL help This option allows the NFS client and server to use RDMA transports (InfiniBand, iWARP, or RoCE). diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 52da3ce54bb5..b5cb921775a0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1042,8 +1042,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) struct rpc_task *task; task = rpc_new_task(task_setup_data); - if (IS_ERR(task)) - goto out; rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); @@ -1053,7 +1051,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) atomic_inc(&task->tk_count); rpc_execute(task); -out: return task; } EXPORT_SYMBOL_GPL(rpc_run_task); @@ -1140,10 +1137,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - if (IS_ERR(task)) { - xprt_free_bc_request(req); - goto out; - } task->tk_rqstp = req; /* @@ -1158,7 +1151,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); -out: dprintk("RPC: rpc_run_bc_task: task= %p\n", task); return task; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5db68b371db2..0cc83839c13c 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -965,11 +965,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) if (task == NULL) { task = rpc_alloc_task(); - if (task == NULL) { - rpc_release_calldata(setup_data->callback_ops, - setup_data->callback_data); - return ERR_PTR(-ENOMEM); - } flags = RPC_TASK_DYNAMIC; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a08aeb56b8e4..bc0f5a0ecbdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -702,59 +702,32 @@ found_pool: return task; } -/* - * Create or destroy enough new threads to make the number - * of threads the given number. If `pool' is non-NULL, applies - * only to threads in that pool, otherwise round-robins between - * all pools. Caller must ensure that mutual exclusion between this and - * server startup or shutdown. - * - * Destroying threads relies on the service threads filling in - * rqstp->rq_task, which only the nfs ones do. Assumes the serv - * has been created using svc_create_pooled(). - * - * Based on code that used to be in nfsd_svc() but tweaked - * to be pool-aware. - */ -int -svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +/* create new threads */ +static int +svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { struct svc_rqst *rqstp; struct task_struct *task; struct svc_pool *chosen_pool; - int error = 0; unsigned int state = serv->sv_nrthreads-1; int node; - if (pool == NULL) { - /* The -1 assumes caller has done a svc_get() */ - nrservs -= (serv->sv_nrthreads-1); - } else { - spin_lock_bh(&pool->sp_lock); - nrservs -= pool->sp_nrthreads; - spin_unlock_bh(&pool->sp_lock); - } - - /* create new threads */ - while (nrservs > 0) { + do { nrservs--; chosen_pool = choose_pool(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); rqstp = svc_prepare_thread(serv, chosen_pool, node); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - break; - } + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); __module_get(serv->sv_ops->svo_module); task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { - error = PTR_ERR(task); module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); - break; + return PTR_ERR(task); } rqstp->rq_task = task; @@ -763,18 +736,103 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) svc_sock_update_bufs(serv); wake_up_process(task); - } + } while (nrservs > 0); + + return 0; +} + + +/* destroy old threads */ +static int +svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + /* destroy old threads */ - while (nrservs < 0 && - (task = choose_victim(serv, pool, &state)) != NULL) { + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; send_sig(SIGINT, task, 1); nrservs++; + } while (nrservs < 0); + + return 0; +} + +/* + * Create or destroy enough new threads to make the number + * of threads the given number. If `pool' is non-NULL, applies + * only to threads in that pool, otherwise round-robins between + * all pools. Caller must ensure that mutual exclusion between this and + * server startup or shutdown. + * + * Destroying threads relies on the service threads filling in + * rqstp->rq_task, which only the nfs ones do. Assumes the serv + * has been created using svc_create_pooled(). + * + * Based on code that used to be in nfsd_svc() but tweaked + * to be pool-aware. + */ +int +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); } - return error; + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_signal_kthreads(serv, pool, nrservs); + return 0; } EXPORT_SYMBOL_GPL(svc_set_num_threads); +/* destroy old threads */ +static int +svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + + /* destroy old threads */ + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; + kthread_stop(task); + nrservs++; + } while (nrservs < 0); + return 0; +} + +int +svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_stop_kthreads(serv, pool, nrservs); + return 0; +} +EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 1f7082144e01..e34f4ee7f2b6 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -807,7 +807,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_decode); /** - * xdr_init_decode - Initialize an xdr_stream for decoding data. + * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer from which to decode data * @pages: list of pages to decode into diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index b530a2852ba8..3e63c5e97ebe 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -651,6 +651,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } +EXPORT_SYMBOL_GPL(xprt_force_disconnect); /** * xprt_conditional_disconnect - force a transport to disconnect diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index ef19fa42c50f..c1ae8142ab73 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ - module.o + svc_rdma_rw.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a044be2d6ad7..694e9b13ecf0 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -494,7 +494,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, } sge->length = len; - ib_dma_sync_single_for_device(ia->ri_device, sge->addr, + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); req->rl_send_wr.num_sge++; return true; @@ -523,7 +523,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; sge[sge_no].lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(device, sge[sge_no].addr, + ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, sge[sge_no].length, DMA_TO_DEVICE); /* If there is a Read chunk, the page list is being handled @@ -781,9 +781,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) return 0; out_err: - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", - PTR_ERR(iptr)); - r_xprt->rx_stats.failed_marshal_count++; + if (PTR_ERR(iptr) != -ENOBUFS) { + pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", + PTR_ERR(iptr)); + r_xprt->rx_stats.failed_marshal_count++; + } return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index c846ca9f1eba..a4a8f6989ee7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; -unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; -static unsigned int min_max_inline = 4096; -static unsigned int max_max_inline = 65536; +unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; atomic_t rdma_stat_recv; atomic_t rdma_stat_read; @@ -247,8 +247,6 @@ int svc_rdma_init(void) dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tsq_depth : %u\n", - svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index ff1df40f0d26..c676ed0efb5a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -12,7 +12,17 @@ #undef SVCRDMA_BACKCHANNEL_DEBUG -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, +/** + * svc_rdma_handle_bc_reply - Process incoming backchannel reply + * @xprt: controlling backchannel transport + * @rdma_resp: pointer to incoming transport header + * @rcvbuf: XDR buffer into which to decode the reply + * + * Returns: + * %0 if @rcvbuf is filled in, xprt_complete_rqst called, + * %-EAGAIN if server should call ->recvfrom again. + */ +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct xdr_buf *rcvbuf) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, p = (__be32 *)src->iov_base; len = src->iov_len; - xid = rmsgp->rm_xid; + xid = *rdma_resp; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: xid=%08x, length=%zu\n", __func__, be32_to_cpu(xid), len); pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); pr_info("%s: RPC: %*ph\n", __func__, (int)len, p); #endif @@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, goto out_unlock; memcpy(dst->iov_base, p, len); - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) @@ -90,9 +100,9 @@ out_notfound: * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. * - * This is similar to svc_rdma_reply, but takes an rpc_rqst - * instead, does not support chunks, and avoids blocking memory - * allocation. + * This is similar to svc_rdma_send_reply_msg, but takes a struct + * rpc_rqst instead, does not support chunks, and avoids blocking + * memory allocation. * * XXX: There is still an opportunity to block in svc_rdma_send() * if there are no SQ entries to post the Send. This may occur if @@ -101,59 +111,36 @@ out_notfound: static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { - struct xdr_buf *sndbuf = &rqst->rq_snd_buf; struct svc_rdma_op_ctxt *ctxt; - struct svc_rdma_req_map *vec; - struct ib_send_wr send_wr; int ret; - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false); - if (ret) + ctxt = svc_rdma_get_context(rdma); + + /* rpcrdma_bc_send_request builds the transport header and + * the backchannel RPC message in the same buffer. Thus only + * one SGE is needed to send both. + */ + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, + rqst->rq_snd_buf.len); + if (ret < 0) goto out_err; ret = svc_rdma_repost_recv(rdma, GFP_NOIO); if (ret) goto out_err; - ctxt = svc_rdma_get_context(rdma); - ctxt->pages[0] = virt_to_page(rqst->rq_buffer); - ctxt->count = 1; - - ctxt->direction = DMA_TO_DEVICE; - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = sndbuf->len; - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, - sndbuf->len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { - ret = -EIO; - goto out_unmap; - } - svc_rdma_count_mappings(rdma, ctxt); - - memset(&send_wr, 0, sizeof(send_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list = ctxt->sge; - send_wr.num_sge = 1; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - ret = svc_rdma_send(rdma, &send_wr); - if (ret) { - ret = -EIO; + ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); + if (ret) goto out_unmap; - } out_err: - svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: %s returns %d\n", __func__, ret); return ret; out_unmap: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); + ret = -EIO; goto out_err; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 1c4aabf0f657..bdcf7d85a3dc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -166,92 +166,3 @@ out_inval: dprintk("svcrdma: failed to parse transport header\n"); return -EINVAL; } - -int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err, __be32 *va) -{ - __be32 *startp = va; - - *va++ = rmsgp->rm_xid; - *va++ = rmsgp->rm_vers; - *va++ = xprt->sc_fc_credits; - *va++ = rdma_error; - *va++ = cpu_to_be32(err); - if (err == ERR_VERS) { - *va++ = rpcrdma_version; - *va++ = rpcrdma_version; - } - - return (int)((unsigned long)va - (unsigned long)startp); -} - -/** - * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header - * @rdma_resp: buffer containing Reply transport header - * - * Returns length of transport header, in bytes. - */ -unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) -{ - unsigned int nsegs; - __be32 *p; - - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. */ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - return (unsigned long)p - (unsigned long)rdma_resp; -} - -void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) -{ - struct rpcrdma_write_array *ary; - - /* no read-list */ - rmsgp->rm_body.rm_chunks[0] = xdr_zero; - - /* write-array discrim */ - ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); - - /* write-list terminator */ - ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; - - /* reply-array discriminator */ - ary->wc_array[chunks].wc_target.rs_length = xdr_zero; -} - -void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, - int chunks) -{ - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); -} - -void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, - int chunk_no, - __be32 rs_handle, - __be64 rs_offset, - u32 write_len) -{ - struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; - seg->rs_handle = rs_handle; - seg->rs_offset = rs_offset; - seg->rs_length = cpu_to_be32(write_len); -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f7b2daf72a86..27a99bf5b1a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.buflen = head->arg.buflen; } +static void svc_rdma_send_error(struct svcxprt_rdma *xprt, + __be32 *rdma_argp, int status) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p, *err_msgp; + unsigned int length; + struct page *page; + int ret; + + ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); + if (ret) + return; + + page = alloc_page(GFP_KERNEL); + if (!page) + return; + err_msgp = page_address(page); + + p = err_msgp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = xprt->sc_fc_credits; + *p++ = rdma_error; + if (status == -EPROTONOSUPPORT) { + *p++ = err_vers; + *p++ = rpcrdma_version; + *p++ = rpcrdma_version; + } else { + *p++ = err_chunk; + } + length = (unsigned long)p - (unsigned long)err_msgp; + + /* Map transport header; no RPC message payload */ + ctxt = svc_rdma_get_context(xprt); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); + return; + } + + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} + /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool -svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, + __be32 *rdma_resp) { - __be32 *p = (__be32 *)rmsgp; + __be32 *p; if (!xprt->xpt_bc_xprt) return false; - if (rmsgp->rm_type != rdma_msg) + p = rdma_resp + 3; + if (*p++ != rdma_msg) return false; - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != rmsgp->rm_xid) + /* XID sanity */ + if (*p++ != *rdma_resp) return false; /* call direction */ - if (p[8] == cpu_to_be32(RPC_CALL)) + if (*p == cpu_to_be32(RPC_CALL)) return false; return true; @@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, + &rmsgp->rm_xid, &rqstp->rq_arg); svc_rdma_put_context(ctxt, 0); if (ret) @@ -686,7 +739,7 @@ complete: return ret; out_err: - svc_rdma_send_error(rdma_xprt, rmsgp, ret); + svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret); svc_rdma_put_context(ctxt, 0); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c new file mode 100644 index 000000000000..0cf620277693 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. + */ + +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/debug.h> + +#include <rdma/rw.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* Each R/W context contains state for one chain of RDMA Read or + * Write Work Requests. + * + * Each WR chain handles a single contiguous server-side buffer, + * because scatterlist entries after the first have to start on + * page alignment. xdr_buf iovecs cannot guarantee alignment. + * + * Each WR chain handles only one R_key. Each RPC-over-RDMA segment + * from a client may contain a unique R_key, so each WR chain moves + * up to one segment at a time. + * + * The scatterlist makes this data structure over 4KB in size. To + * make it less likely to fail, and to handle the allocation for + * smaller I/O requests without disabling bottom-halves, these + * contexts are created on demand, but cached and reused until the + * controlling svcxprt_rdma is destroyed. + */ +struct svc_rdma_rw_ctxt { + struct list_head rw_list; + struct rdma_rw_ctx rw_ctx; + int rw_nents; + struct sg_table rw_sg_table; + struct scatterlist rw_first_sgl[0]; +}; + +static inline struct svc_rdma_rw_ctxt * +svc_rdma_next_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, + rw_list); +} + +static struct svc_rdma_rw_ctxt * +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +{ + struct svc_rdma_rw_ctxt *ctxt; + + spin_lock(&rdma->sc_rw_ctxt_lock); + + ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); + if (ctxt) { + list_del(&ctxt->rw_list); + spin_unlock(&rdma->sc_rw_ctxt_lock); + } else { + spin_unlock(&rdma->sc_rw_ctxt_lock); + ctxt = kmalloc(sizeof(*ctxt) + + SG_CHUNK_SIZE * sizeof(struct scatterlist), + GFP_KERNEL); + if (!ctxt) + goto out; + INIT_LIST_HEAD(&ctxt->rw_list); + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; + if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl)) { + kfree(ctxt); + ctxt = NULL; + } +out: + return ctxt; +} + +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + sg_free_table_chained(&ctxt->rw_sg_table, true); + + spin_lock(&rdma->sc_rw_ctxt_lock); + list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); +} + +/** + * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts + * @rdma: transport about to be destroyed + * + */ +void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { + list_del(&ctxt->rw_list); + kfree(ctxt); + } +} + +/* A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a a set of rdma_rw's that handle data movement + * for all segments of one chunk. + * + * These are small, acquired with a single allocator call, and + * no more than one is needed per chunk. They are allocated on + * demand, and not cached. + */ +struct svc_rdma_chunk_ctxt { + struct ib_cqe cc_cqe; + struct svcxprt_rdma *cc_rdma; + struct list_head cc_rwctxts; + int cc_sqecount; + enum dma_data_direction cc_dir; +}; + +static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) +{ + cc->cc_rdma = rdma; + svc_xprt_get(&rdma->sc_xprt); + + INIT_LIST_HEAD(&cc->cc_rwctxts); + cc->cc_sqecount = 0; + cc->cc_dir = dir; +} + +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { + list_del(&ctxt->rw_list); + + rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, cc->cc_dir); + svc_rdma_put_rw_ctxt(rdma, ctxt); + } + svc_xprt_put(&rdma->sc_xprt); +} + +/* State for sending a Write or Reply chunk. + * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + unsigned int wi_nsegs; + __be32 *wi_segs; + + /* SGL constructor arguments */ + struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; +}; + +static struct svc_rdma_write_info * +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +{ + struct svc_rdma_write_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_nsegs = be32_to_cpup(++chunk); + info->wi_segs = ++chunk; + svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE); + return info; +} + +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + svc_rdma_cc_release(&info->wi_cc); + kfree(info); +} + +/** + * svc_rdma_write_done - Write chunk completion + * @cq: controlling Completion Queue + * @wc: Work Completion + * + * Pages under I/O are freed by a subsequent Send completion. + */ +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: write ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_rdma_write_info_free(info); +} + +/* This function sleeps when the transport's Send Queue is congested. + * + * Assumptions: + * - If ib_post_send() succeeds, only one completion is expected, + * even if one or more WRs are flushed. This is true when posting + * an rdma_rw_ctx or when posting a single signaled WR. + */ +static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_xprt *xprt = &rdma->sc_xprt; + struct ib_send_wr *first_wr, *bad_wr; + struct list_head *tmp; + struct ib_cqe *cqe; + int ret; + + first_wr = NULL; + cqe = &cc->cc_cqe; + list_for_each(tmp, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *ctxt; + + ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + + do { + if (atomic_sub_return(cc->cc_sqecount, + &rdma->sc_sq_avail) > 0) { + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + break; + return 0; + } + + atomic_inc(&rdma_stat_sq_starve); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + } while (1); + + pr_err("svcrdma: ib_post_send failed (%d)\n", ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + + /* If even one was posted, there will be a completion. */ + if (bad_wr != first_wr) + return 0; + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + return -ENOTCONN; +} + +/* Build and DMA-map an SGL that covers one kvec in an xdr_buf + */ +static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) +{ + struct scatterlist *sg = ctxt->rw_sg_table.sgl; + + sg_set_buf(&sg[0], info->wi_base, len); + info->wi_base += len; + + ctxt->rw_nents = 1; +} + +/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. + */ +static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) +{ + unsigned int sge_no, sge_bytes, page_off, page_no; + struct xdr_buf *xdr = info->wi_xdr; + struct scatterlist *sg; + struct page **page; + + page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK; + page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT; + page = xdr->pages + page_no; + info->wi_next_off += remaining; + sg = ctxt->rw_sg_table.sgl; + sge_no = 0; + do { + sge_bytes = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + sg_set_page(sg, *page, sge_bytes, page_off); + + remaining -= sge_bytes; + sg = sg_next(sg); + page_off = 0; + sge_no++; + page++; + } while (remaining); + + ctxt->rw_nents = sge_no; +} + +/* Construct RDMA Write WRs to send a portion of an xdr_buf containing + * an RPC Reply. + */ +static int +svc_rdma_build_writes(struct svc_rdma_write_info *info, + void (*constructor)(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt), + unsigned int remaining) +{ + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + __be32 *seg; + int ret; + + cc->cc_cqe.done = svc_rdma_write_done; + seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; + do { + unsigned int write_len; + u32 seg_length, seg_handle; + u64 seg_offset; + + if (info->wi_seg_no >= info->wi_nsegs) + goto out_overflow; + + seg_handle = be32_to_cpup(seg); + seg_length = be32_to_cpup(seg + 1); + xdr_decode_hyper(seg + 2, &seg_offset); + seg_offset += info->wi_seg_off; + + write_len = min(remaining, seg_length - info->wi_seg_off); + ctxt = svc_rdma_get_rw_ctxt(rdma, + (write_len >> PAGE_SHIFT) + 2); + if (!ctxt) + goto out_noctx; + + constructor(info, write_len, ctxt); + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, 0, seg_offset, + seg_handle, DMA_TO_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + if (write_len == seg_length - info->wi_seg_off) { + seg += 4; + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { + info->wi_seg_off += write_len; + } + remaining -= write_len; + } while (remaining); + + return 0; + +out_overflow: + dprintk("svcrdma: inadequate space in Write chunk (%u)\n", + info->wi_nsegs); + return -E2BIG; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_initerr: + svc_rdma_put_rw_ctxt(rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +/* Send one of an xdr_buf's kvecs by itself. To send a Reply + * chunk, the whole RPC Reply is written back to the client. + * This function writes either the head or tail of the xdr_buf + * containing the Reply. + */ +static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, + struct kvec *vec) +{ + info->wi_base = vec->iov_base; + return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + vec->iov_len); +} + +/* Send an xdr_buf's page list by itself. A Write chunk is + * just the page list. a Reply chunk is the head, page list, + * and tail. This function is shared between the two types + * of chunk. + */ +static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, + struct xdr_buf *xdr) +{ + info->wi_xdr = xdr; + info->wi_next_off = 0; + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + xdr->page_len); +} + +/** + * svc_rdma_send_write_chunk - Write all segments in a Write chunk + * @rdma: controlling RDMA transport + * @wr_ch: Write chunk provided by client + * @xdr: xdr_buf containing the data payload + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Write chunk, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, + struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int ret; + + if (!xdr->page_len) + return 0; + + info = svc_rdma_write_info_alloc(rdma, wr_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return xdr->page_len; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +/** + * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * @rdma: controlling RDMA transport + * @rp_ch: Reply chunk provided by client + * @writelist: true if client provided a Write list + * @xdr: xdr_buf containing an RPC Reply + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Reply chunk, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, + bool writelist, struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int consumed, ret; + + info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + if (ret < 0) + goto out_err; + consumed = xdr->head[0].iov_len; + + /* Send the page list in the Reply chunk only if the + * client did not provide Write chunks. + */ + if (!writelist && xdr->page_len) { + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + consumed += xdr->page_len; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); + if (ret < 0) + goto out_err; + consumed += xdr->tail[0].iov_len; + } + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return consumed; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 515221b16d09..1736337f3a55 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -40,6 +41,63 @@ * Author: Tom Tucker <tom@opengridcomputing.com> */ +/* Operation + * + * The main entry point is svc_rdma_sendto. This is called by the + * RPC server when an RPC Reply is ready to be transmitted to a client. + * + * The passed-in svc_rqst contains a struct xdr_buf which holds an + * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA + * transport header, post all Write WRs needed for this Reply, then post + * a Send WR conveying the transport header and the RPC message itself to + * the client. + * + * svc_rdma_sendto must fully transmit the Reply before returning, as + * the svc_rqst will be recycled as soon as sendto returns. Remaining + * resources referred to by the svc_rqst are also recycled at that time. + * Therefore any resources that must remain longer must be detached + * from the svc_rqst and released later. + * + * Page Management + * + * The I/O that performs Reply transmission is asynchronous, and may + * complete well after sendto returns. Thus pages under I/O must be + * removed from the svc_rqst before sendto returns. + * + * The logic here depends on Send Queue and completion ordering. Since + * the Send WR is always posted last, it will always complete last. Thus + * when it completes, it is guaranteed that all previous Write WRs have + * also completed. + * + * Write WRs are constructed and posted. Each Write segment gets its own + * svc_rdma_rw_ctxt, allowing the Write completion handler to find and + * DMA-unmap the pages under I/O for that Write segment. The Write + * completion handler does not release any pages. + * + * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt. + * The ownership of all of the Reply's pages are transferred into that + * ctxt, the Send WR is posted, and sendto returns. + * + * The svc_rdma_op_ctxt is presented when the Send WR completes. The + * Send completion handler finally releases the Reply's pages. + * + * This mechanism also assumes that completions on the transport's Send + * Completion Queue do not run in parallel. Otherwise a Write completion + * and Send completion running at the same time could release pages that + * are still DMA-mapped. + * + * Error Handling + * + * - If the Send WR is posted successfully, it will either complete + * successfully, or get flushed. Either way, the Send completion + * handler releases the Reply's pages. + * - If the Send WR cannot be not posted, the forward path releases + * the Reply's pages. + * + * This handles the case, without the use of page reference counting, + * where two different Write segments send portions of the same page. + */ + #include <linux/sunrpc/debug.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/spinlock.h> @@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len) return (len & 3) ? (4 - (len & 3)) : 0; } -int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec, - bool write_chunk_present) +/* Returns length of transport header, in bytes. + */ +static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) { - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no; - - if (xdr->len != - (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: %s: XDR buffer length error\n", __func__); - return -EIO; - } + unsigned int nsegs; + __be32 *p; - /* Skip the first sge, this is for the RPCRDMA header */ - sge_no = 1; + p = rdma_resp; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; - /* Head SGE */ - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - sge_no++; - - /* pages SGE */ - page_no = 0; - page_bytes = xdr->page_len; - page_off = xdr->page_base; - while (page_bytes) { - vec->sge[sge_no].iov_base = - page_address(xdr->pages[page_no]) + page_off; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - vec->sge[sge_no].iov_len = sge_bytes; - - sge_no++; - page_no++; - page_off = 0; /* reset for next time through loop */ + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; } - /* Tail SGE */ - if (xdr->tail[0].iov_len) { - unsigned char *base = xdr->tail[0].iov_base; - size_t len = xdr->tail[0].iov_len; - u32 xdr_pad = xdr_padsize(xdr->page_len); + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } - if (write_chunk_present && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } + return (unsigned long)p - (unsigned long)rdma_resp; +} - if (len) { - vec->sge[sge_no].iov_base = base; - vec->sge[sge_no].iov_len = len; - sge_no++; +/* One Write chunk is copied from Call transport header to Reply + * transport header. Each segment's length field is updated to + * reflect number of bytes consumed in the segment. + * + * Returns number of segments in this chunk. + */ +static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, + unsigned int remaining) +{ + unsigned int i, nsegs; + u32 seg_len; + + /* Write list discriminator */ + *dst++ = *src++; + + /* number of segments in this chunk */ + nsegs = be32_to_cpup(src); + *dst++ = *src++; + + for (i = nsegs; i; i--) { + /* segment's RDMA handle */ + *dst++ = *src++; + + /* bytes returned in this segment */ + seg_len = be32_to_cpu(*src); + if (remaining >= seg_len) { + /* entire segment was consumed */ + *dst = *src; + remaining -= seg_len; + } else { + /* segment only partly filled */ + *dst = cpu_to_be32(remaining); + remaining = 0; } - } + dst++; src++; - dprintk("svcrdma: %s: sge_no %d page_no %d " - "page_base %u page_len %u head_len %zu tail_len %zu\n", - __func__, sge_no, page_no, xdr->page_base, xdr->page_len, - xdr->head[0].iov_len, xdr->tail[0].iov_len); + /* segment's RDMA offset */ + *dst++ = *src++; + *dst++ = *src++; + } - vec->count = sge_no; - return 0; + return nsegs; } -static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - u32 xdr_off, size_t len, int dir) +/* The client provided a Write list in the Call message. Fill in + * the segments in the first Write chunk in the Reply's transport + * header with the number of bytes consumed in each segment. + * Remaining chunks are returned unused. + * + * Assumptions: + * - Client has provided only one Write chunk + */ +static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, + unsigned int consumed) { - struct page *page; - dma_addr_t dma_addr; - if (xdr_off < xdr->head[0].iov_len) { - /* This offset is in the head */ - xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->head[0].iov_base); - } else { - xdr_off -= xdr->head[0].iov_len; - if (xdr_off < xdr->page_len) { - /* This offset is in the page list */ - xdr_off += xdr->page_base; - page = xdr->pages[xdr_off >> PAGE_SHIFT]; - xdr_off &= ~PAGE_MASK; - } else { - /* This offset is in the tail */ - xdr_off -= xdr->page_len; - xdr_off += (unsigned long) - xdr->tail[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->tail[0].iov_base); - } + unsigned int nsegs; + __be32 *p, *q; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + q = wr_ch; + while (*q != xdr_zero) { + nsegs = xdr_encode_write_chunk(p, q, consumed); + q += 2 + nsegs * rpcrdma_segment_maxsz; + p += 2 + nsegs * rpcrdma_segment_maxsz; + consumed = 0; } - dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, - min_t(size_t, PAGE_SIZE, len), dir); - return dma_addr; + + /* Terminate Write list */ + *p++ = xdr_zero; + + /* Reply chunk discriminator; may be replaced later */ + *p = xdr_zero; +} + +/* The client provided a Reply chunk in the Call message. Fill in + * the segments in the Reply chunk in the Reply message with the + * number of bytes consumed in each segment. + * + * Assumptions: + * - Reply can always fit in the provided Reply chunk + */ +static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, + unsigned int consumed) +{ + __be32 *p; + + /* Find the Reply chunk in the Reply's xprt header. + * RPC-over-RDMA V1 replies never have a Read list. + */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + /* Skip past Write list */ + while (*p++ != xdr_zero) + p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + + xdr_encode_write_chunk(p, rp_ch, consumed); } /* Parse the RPC Call's transport header. */ -static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, - struct rpcrdma_write_array **write, - struct rpcrdma_write_array **reply) +static void svc_rdma_get_write_arrays(__be32 *rdma_argp, + __be32 **write, __be32 **reply) { __be32 *p; - p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; + p = rdma_argp + rpcrdma_fixed_maxsz; /* Read list */ while (*p++ != xdr_zero) @@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Write list */ if (*p != xdr_zero) { - *write = (struct rpcrdma_write_array *)p; + *write = p; while (*p++ != xdr_zero) p += 1 + be32_to_cpu(*p) * 4; } else { @@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Reply chunk */ if (*p != xdr_zero) - *reply = (struct rpcrdma_write_array *)p; + *reply = p; else *reply = NULL; } @@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, * Invalidate, and responder chooses one rkey to invalidate. * * Find a candidate rkey to invalidate when sending a reply. Picks the - * first rkey it finds in the chunks lists. + * first R_key it finds in the chunk lists. * * Returns zero if RPC's chunk lists are empty. */ -static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, - struct rpcrdma_write_array *wr_ary, - struct rpcrdma_write_array *rp_ary) +static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, + __be32 *wr_lst, __be32 *rp_ch) { - struct rpcrdma_read_chunk *rd_ary; - struct rpcrdma_segment *arg_ch; + __be32 *p; - rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0]; - if (rd_ary->rc_discrim != xdr_zero) - return be32_to_cpu(rd_ary->rc_target.rs_handle); + p = rdma_argp + rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + p += 2; + else if (wr_lst && be32_to_cpup(wr_lst + 1)) + p = wr_lst + 2; + else if (rp_ch && be32_to_cpup(rp_ch + 1)) + p = rp_ch + 2; + else + return 0; + return be32_to_cpup(p); +} - if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { - arg_ch = &wr_ary->wc_array[0].wc_target; - return be32_to_cpu(arg_ch->rs_handle); - } +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * is used during completion to DMA-unmap this memory, and + * it uses ib_dma_unmap_page() exclusively. + */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + unsigned char *base, + unsigned int len) +{ + unsigned long offset = (unsigned long)base & ~PAGE_MASK; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; - if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { - arg_ch = &rp_ary->wc_array[0].wc_target; - return be32_to_cpu(arg_ch->rs_handle); - } + dma_addr = ib_dma_map_page(dev, virt_to_page(base), + offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EIO; + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); return 0; } -/* Assumptions: - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE - */ -static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, - u32 rmr, u64 to, - u32 xdr_off, int write_len, - struct svc_rdma_req_map *vec) +static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + struct page *page, + unsigned int offset, + unsigned int len) { - struct ib_rdma_wr write_wr; - struct ib_sge *sge; - int xdr_sge_no; - int sge_no; - int sge_bytes; - int sge_off; - int bc; - struct svc_rdma_op_ctxt *ctxt; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; - if (vec->count > RPCSVC_MAXPAGES) { - pr_err("svcrdma: Too many pages (%lu)\n", vec->count); + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) return -EIO; - } - dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, vec->sge=%p, vec->count=%lu\n", - rmr, (unsigned long long)to, xdr_off, - write_len, vec->sge, vec->count); + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; +} - ctxt = svc_rdma_get_context(xprt); +/** + * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * @rdma: controlling transport + * @ctxt: op_ctxt for the Send WR + * @rdma_resp: buffer containing transport header + * @len: length of transport header + * + * Returns: + * %0 if the header is DMA mapped, + * %-EIO if DMA mapping failed. + */ +int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, + unsigned int len) +{ ctxt->direction = DMA_TO_DEVICE; - sge = ctxt->sge; - - /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; - xdr_sge_no++) { - if (vec->sge[xdr_sge_no].iov_len > bc) - break; - bc -= vec->sge[xdr_sge_no].iov_len; - } - - sge_off = bc; - bc = write_len; - sge_no = 0; - - /* Copy the remaining SGE */ - while (bc != 0) { - sge_bytes = min_t(size_t, - bc, vec->sge[xdr_sge_no].iov_len-sge_off); - sge[sge_no].length = sge_bytes; - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - svc_rdma_count_mappings(xprt, ctxt); - sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->count++; - sge_off = 0; - sge_no++; - xdr_sge_no++; - if (xdr_sge_no > vec->count) { - pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no); - goto err; - } - bc -= sge_bytes; - if (sge_no == xprt->sc_max_sge) - break; - } - - /* Prepare WRITE WR */ - memset(&write_wr, 0, sizeof write_wr); - ctxt->cqe.done = svc_rdma_wc_write; - write_wr.wr.wr_cqe = &ctxt->cqe; - write_wr.wr.sg_list = &sge[0]; - write_wr.wr.num_sge = sge_no; - write_wr.wr.opcode = IB_WR_RDMA_WRITE; - write_wr.wr.send_flags = IB_SEND_SIGNALED; - write_wr.rkey = rmr; - write_wr.remote_addr = to; - - /* Post It */ - atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr.wr)) - goto err; - return write_len - bc; - err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - return -EIO; + ctxt->pages[0] = virt_to_page(rdma_resp); + ctxt->count = 1; + return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); } -noinline -static int send_write_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *wr_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) +/* Load the xdr_buf into the ctxt's sge array, and DMA map each + * element as it is added. + * + * Returns the number of sge elements loaded on success, or + * a negative errno on failure. + */ +static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) { - u32 xfer_len = rqstp->rq_res.page_len; - int write_len; - u32 xdr_off; - int chunk_off; - int chunk_no; - int nchunks; - struct rpcrdma_write_array *res_ary; + unsigned int len, sge_no, remaining, page_off; + struct page **ppages; + unsigned char *base; + u32 xdr_pad; int ret; - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[1]; - - /* Write chunks start at the pagelist */ - nchunks = be32_to_cpu(wr_ary->wc_nchunks); - for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - struct rpcrdma_segment *arg_ch; - u64 rs_offset; - - arg_ch = &wr_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); - - /* Prepare the response chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - arg_ch->rs_handle, - arg_ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(arg_ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; + sge_no = 1; + + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret < 0) + return ret; + + /* If a Write chunk is present, the xdr_buf's page list + * is not included inline. However the Upper Layer may + * have added XDR padding in the tail buffer, and that + * should not be included inline. + */ + if (wr_lst) { + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; + xdr_pad = xdr_padsize(xdr->page_len); + + if (len && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; } + + goto tail; + } + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_off = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - page_off, remaining); + + ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, + *ppages++, page_off, len); + if (ret < 0) + return ret; + + remaining -= len; + page_off = 0; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); - return rqstp->rq_res.page_len; + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; +tail: + if (len) { + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (ret < 0) + return ret; + } -out_err: - pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret); - return -EIO; + return sge_no - 1; } -noinline -static int send_reply_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *rp_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt) { - u32 xfer_len = rqstp->rq_res.len; - int write_len; - u32 xdr_off; - int chunk_no; - int chunk_off; - int nchunks; - struct rpcrdma_segment *ch; - struct rpcrdma_write_array *res_ary; - int ret; + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; - /* XXX: need to fix when reply lists occur with read-list and or - * write-list */ - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[2]; - - /* xdr offset starts at RPC message */ - nchunks = be32_to_cpu(rp_ary->wc_nchunks); - for (xdr_off = 0, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - u64 rs_offset; - ch = &rp_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); - - /* Prepare the reply chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - ch->rs_handle, ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; - } + ctxt->count += pages; + for (i = 0; i < pages; i++) { + ctxt->pages[i + 1] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); + rqstp->rq_next_page = rqstp->rq_respages + 1; +} - return rqstp->rq_res.len; +/** + * svc_rdma_post_send_wr - Set up and post one Send Work Request + * @rdma: controlling transport + * @ctxt: op_ctxt for transmitting the Send WR + * @num_sge: number of SGEs to send + * @inv_rkey: R_key argument to Send With Invalidate, or zero + * + * Returns: + * %0 if the Send* was posted successfully, + * %-ENOTCONN if the connection was lost or dropped, + * %-EINVAL if there was a problem with the Send we built, + * %-ENOMEM if ib_post_send failed. + */ +int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, int num_sge, + u32 inv_rkey) +{ + struct ib_send_wr *send_wr = &ctxt->send_wr; -out_err: - pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret); - return -EIO; + dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); + + send_wr->next = NULL; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr->wr_cqe = &ctxt->cqe; + send_wr->sg_list = ctxt->sge; + send_wr->num_sge = num_sge; + send_wr->send_flags = IB_SEND_SIGNALED; + if (inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + return svc_rdma_send(rdma, send_wr); } -/* This function prepares the portion of the RPCRDMA message to be - * sent in the RDMA_SEND. This function is called after data sent via - * RDMA has already been transmitted. There are three cases: - * - The RPCRDMA header, RPC header, and payload are all sent in a - * single RDMA_SEND. This is the "inline" case. - * - The RPCRDMA header and some portion of the RPC header and data - * are sent via this RDMA_SEND and another portion of the data is - * sent via RDMA. - * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC - * header and data are all transmitted via RDMA. - * In all three cases, this function prepares the RPCRDMA header in - * sge[0], the 'type' parameter indicates the type to place in the - * RPCRDMA header, and the 'byte_count' field indicates how much of - * the XDR to include in this RDMA_SEND. NB: The offset of the payload - * to send is zero in the XDR. +/* Prepare the portion of the RPC Reply that will be transmitted + * via RDMA Send. The RPC-over-RDMA transport header is prepared + * in sge[0], and the RPC xdr_buf is prepared in following sges. + * + * Depending on whether a Write list or Reply chunk is present, + * the server may send all, a portion of, or none of the xdr_buf. + * In the latter case, only the transport header (sge[0]) is + * transmitted. + * + * RDMA Send is the last step of transmitting an RPC reply. Pages + * involved in the earlier RDMA Writes are here transferred out + * of the rqstp and into the ctxt's page array. These pages are + * DMA unmapped by each Write completion, but the subsequent Send + * completion finally releases these pages. + * + * Assumptions: + * - The Reply's transport header will never be larger than a page. */ -static int send_reply(struct svcxprt_rdma *rdma, - struct svc_rqst *rqstp, - struct page *page, - struct rpcrdma_msg *rdma_resp, - struct svc_rdma_req_map *vec, - int byte_count, - u32 inv_rkey) +static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rqst *rqstp, + __be32 *wr_lst, __be32 *rp_ch) { struct svc_rdma_op_ctxt *ctxt; - struct ib_send_wr send_wr; - u32 xdr_off; - int sge_no; - int sge_bytes; - int page_no; - int pages; - int ret = -EIO; - - /* Prepare the context */ + u32 inv_rkey; + int ret; + + dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n", + (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"), + rqstp->rq_res.head[0].iov_len, + rqstp->rq_res.page_len, + rqstp->rq_res.tail[0].iov_len); + ctxt = svc_rdma_get_context(rdma); - ctxt->direction = DMA_TO_DEVICE; - ctxt->pages[0] = page; - ctxt->count = 1; - /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = - svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, page, 0, - ctxt->sge[0].length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, + svc_rdma_reply_hdr_len(rdma_resp)); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - - ctxt->direction = DMA_TO_DEVICE; - /* Map the payload indicated by 'byte_count' */ - xdr_off = 0; - for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); - byte_count -= sge_bytes; - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) + if (!rp_ch) { + ret = svc_rdma_map_reply_msg(rdma, ctxt, + &rqstp->rq_res, wr_lst); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[sge_no].length = sge_bytes; } - if (byte_count != 0) { - pr_err("svcrdma: Could not map %d bytes\n", byte_count); + + svc_rdma_save_io_pages(rqstp, ctxt); + + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); + if (ret) goto err; - } - /* Save all respages in the ctxt and remove them from the - * respages array. They are our pages until the I/O - * completes. + return 0; + +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return ret; +} + +/* Given the client-provided Write and Reply chunks, the server was not + * able to form a complete reply. Return an RDMA_ERROR message so the + * client can retire this RPC transaction. As above, the Send completion + * routine releases payload pages that were part of a previous RDMA Write. + * + * Remote Invalidation is skipped for simplicity. + */ +static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_resp, struct svc_rqst *rqstp) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p; + int ret; + + ctxt = svc_rdma_get_context(rdma); + + /* Replace the original transport header with an + * RDMA_ERROR response. XID etc are preserved. */ - pages = rqstp->rq_next_page - rqstp->rq_respages; - for (page_no = 0; page_no < pages; page_no++) { - ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; - ctxt->count++; - rqstp->rq_respages[page_no] = NULL; - } - rqstp->rq_next_page = rqstp->rq_respages + 1; + p = rdma_resp + 3; + *p++ = rdma_error; + *p = err_chunk; - if (sge_no > rdma->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20); + if (ret < 0) goto err; - } - memset(&send_wr, 0, sizeof send_wr); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list = ctxt->sge; - send_wr.num_sge = sge_no; - if (inv_rkey) { - send_wr.opcode = IB_WR_SEND_WITH_INV; - send_wr.ex.invalidate_rkey = inv_rkey; - } else - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - ret = svc_rdma_send(rdma, &send_wr); + svc_rdma_save_io_pages(rqstp, ctxt); + + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0); if (ret) goto err; return 0; - err: +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return ret; @@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) { } +/** + * svc_rdma_sendto - Transmit an RPC reply + * @rqstp: processed RPC request, reply XDR already in ::rq_res + * + * Any resources still associated with @rqstp are released upon return. + * If no reply message was possible, the connection is closed. + * + * Returns: + * %0 if an RPC reply has been successfully posted, + * %-ENOMEM if a resource shortage occurred (connection is lost), + * %-ENOTCONN if posting failed (connection is lost). + */ int svc_rdma_sendto(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct rpcrdma_msg *rdma_argp; - struct rpcrdma_msg *rdma_resp; - struct rpcrdma_write_array *wr_ary, *rp_ary; - int ret; - int inline_bytes; + __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + struct xdr_buf *xdr = &rqstp->rq_res; struct page *res_page; - struct svc_rdma_req_map *vec; - u32 inv_rkey; - __be32 *p; - - dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + int ret; - /* Get the RDMA request header. The receive logic always - * places this at the start of page 0. + /* Find the call's chunk lists to decide how to send the reply. + * Receive places the Call's xprt header at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); - svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); - - inv_rkey = 0; - if (rdma->sc_snd_w_inv) - inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); + svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); - /* Build an req vec for the XDR */ - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); - if (ret) - goto err0; - inline_bytes = rqstp->rq_res.len; + dprintk("svcrdma: preparing response for XID 0x%08x\n", + be32_to_cpup(rdma_argp)); /* Create the RDMA response header. xprt->xpt_mutex, * acquired in svc_send(), serializes RPC replies. The @@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; rdma_resp = page_address(res_page); - p = &rdma_resp->rm_xid; - *p++ = rdma_argp->rm_xid; - *p++ = rdma_argp->rm_vers; + p = rdma_resp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ary ? rdma_nomsg : rdma_msg; + *p++ = rp_ch ? rdma_nomsg : rdma_msg; /* Start with empty chunks */ *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - /* Send any write-chunk data and build resp write-list */ - if (wr_ary) { - ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec); + if (wr_lst) { + /* XXX: Presume the client sent only one Write chunk */ + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); if (ret < 0) - goto err1; - inline_bytes -= ret + xdr_padsize(ret); + goto err2; + svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); } - - /* Send any reply-list data and update resp reply-list */ - if (rp_ary) { - ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec); + if (rp_ch) { + ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); if (ret < 0) - goto err1; - inline_bytes -= ret; + goto err2; + svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); } - /* Post a fresh Receive buffer _before_ sending the reply */ ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) goto err1; - - ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes, inv_rkey); + ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + wr_lst, rp_ch); if (ret < 0) goto err0; + return 0; - svc_rdma_put_req_map(rdma, vec); - dprintk("svcrdma: send_reply returns %d\n", ret); - return ret; + err2: + if (ret != -E2BIG) + goto err1; + + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); + if (ret) + goto err1; + ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp); + if (ret < 0) + goto err0; + return 0; err1: put_page(res_page); err0: - svc_rdma_put_req_map(rdma, vec); pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n", ret); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } - -void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - int status) -{ - struct ib_send_wr err_wr; - struct page *p; - struct svc_rdma_op_ctxt *ctxt; - enum rpcrdma_errcode err; - __be32 *va; - int length; - int ret; - - ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); - if (ret) - return; - - p = alloc_page(GFP_KERNEL); - if (!p) - return; - va = page_address(p); - - /* XDR encode an error reply */ - err = ERR_CHUNK; - if (status == -EPROTONOSUPPORT) - err = ERR_VERS; - length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); - - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_TO_DEVICE; - ctxt->count = 1; - ctxt->pages[0] = p; - - /* Prepare SGE for local address */ - ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[0].length = length; - ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, - p, 0, length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { - dprintk("svcrdma: Error mapping buffer for protocol error\n"); - svc_rdma_put_context(ctxt, 1); - return; - } - svc_rdma_count_mappings(xprt, ctxt); - - /* Prepare SEND WR */ - memset(&err_wr, 0, sizeof(err_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - err_wr.wr_cqe = &ctxt->cqe; - err_wr.sg_list = ctxt->sge; - err_wr.num_sge = 1; - err_wr.opcode = IB_WR_SEND; - err_wr.send_flags = IB_SEND_SIGNALED; - - /* Post It */ - ret = svc_rdma_send(xprt, &err_wr); - if (ret) { - dprintk("svcrdma: Error %d posting send for protocol error\n", - ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fc8f14c7bfec..a9d9cb1ba4c6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) } } -static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) -{ - struct svc_rdma_req_map *map; - - map = kmalloc(sizeof(*map), flags); - if (map) - INIT_LIST_HEAD(&map->free); - return map; -} - -static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) -{ - unsigned int i; - - /* One for each receive buffer on this connection. */ - i = xprt->sc_max_requests; - - while (i--) { - struct svc_rdma_req_map *map; - - map = alloc_req_map(GFP_KERNEL); - if (!map) { - dprintk("svcrdma: No memory for request map\n"); - return false; - } - list_add(&map->free, &xprt->sc_maps); - } - return true; -} - -struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_req_map *map = NULL; - - spin_lock(&xprt->sc_map_lock); - if (list_empty(&xprt->sc_maps)) - goto out_empty; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del_init(&map->free); - spin_unlock(&xprt->sc_map_lock); - -out: - map->count = 0; - return map; - -out_empty: - spin_unlock(&xprt->sc_map_lock); - - /* Pre-allocation amount was incorrect */ - map = alloc_req_map(GFP_NOIO); - if (map) - goto out; - - WARN_ONCE(1, "svcrdma: empty request map list?\n"); - return NULL; -} - -void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, - struct svc_rdma_req_map *map) -{ - spin_lock(&xprt->sc_map_lock); - list_add(&map->free, &xprt->sc_maps); - spin_unlock(&xprt->sc_map_lock); -} - -static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_maps)) { - struct svc_rdma_req_map *map; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del(&map->free); - kfree(map); - } -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { @@ -474,24 +395,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) } /** - * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc) -{ - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - svc_rdma_send_wc_common_put(cq, wc, "write"); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); -} - -/** * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC * @cq: completion queue * @wc: completed WR @@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_maps); + INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); - spin_lock_init(&cma_xprt->sc_map_lock); + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* * Note that this implies that the underlying transport support @@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt, newxprt->sc_cm_id); dev = newxprt->sc_cm_id->device; + newxprt->sc_port_num = newxprt->sc_cm_id->port_num; /* Qualify the transport resource defaults with the * capabilities of this particular device */ @@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + newxprt->sc_sq_depth = newxprt->sc_rq_depth; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; - if (!svc_rdma_prealloc_maps(newxprt)) - goto errout; /* * Limit ORD based on client limit, local device limit, and @@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.port_num = newxprt->sc_cm_id->port_num; + qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; @@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work) } rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); - svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c717f5410776..62ecbccd9748 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -66,8 +66,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; - int xprt_rdma_pad_optimize = 0; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; +int xprt_rdma_pad_optimize; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -396,7 +396,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy); + rc = rpcrdma_ia_open(new_xprt, sap); if (rc) goto out1; @@ -457,19 +457,33 @@ out1: return ERR_PTR(rc); } -/* - * Close a connection, during shutdown or timeout/reconnect +/** + * xprt_rdma_close - Close down RDMA connection + * @xprt: generic transport to be closed + * + * Called during transport shutdown reconnect, or device + * removal. Caller holds the transport's write lock. */ static void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); - dprintk("RPC: %s: closing\n", __func__); - if (r_xprt->rx_ep.rep_connected > 0) + if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { + xprt_clear_connected(xprt); + rpcrdma_ia_remove(ia); + return; + } + if (ep->rep_connected == -ENODEV) + return; + if (ep->rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(ep, ia); } static void @@ -484,6 +498,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) dprintk("RPC: %s: %u\n", __func__, port); } +/** + * xprt_rdma_timer - invoked when an RPC times out + * @xprt: controlling RPC transport + * @task: RPC task that timed out + * + * Invoked when the transport is still connected, but an RPC + * retransmit timeout occurs. + * + * Since RDMA connections don't have a keep-alive, forcibly + * disconnect and retry to connect. This drives full + * detection of the network path, and retransmissions of + * all pending RPCs. + */ +static void +xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) +{ + dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); + + xprt_force_disconnect(xprt); +} + static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -659,6 +694,8 @@ xprt_rdma_free(struct rpc_task *task) * xprt_rdma_send_request - marshal and send an RPC request * @task: RPC task with an RPC message in rq_snd_buf * + * Caller holds the transport's write lock. + * * Return values: * 0: The request has been sent * ENOTCONN: Caller needs to invoke connect logic then call again @@ -685,6 +722,9 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; + if (!xprt_connected(xprt)) + goto drop_connection; + /* On retransmit, remove any previously registered chunks */ if (unlikely(!list_empty(&req->rl_registered))) r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); @@ -776,6 +816,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .timer = xprt_rdma_timer, .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ .set_port = xprt_rdma_set_port, .connect = xprt_rdma_connect, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3b332b395045..3dbce9ac4327 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,7 +53,7 @@ #include <linux/sunrpc/addr.h> #include <linux/sunrpc/svc_rdma.h> #include <asm/bitops.h> -#include <linux/module.h> /* try_module_get()/module_put() */ + #include <rdma/ib_cm.h> #include "xprt_rdma.h" @@ -69,8 +69,11 @@ /* * internal functions */ +static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); +static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); -static struct workqueue_struct *rpcrdma_receive_wq; +static struct workqueue_struct *rpcrdma_receive_wq __read_mostly; int rpcrdma_alloc_wq(void) @@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; - ib_dma_sync_single_for_cpu(rep->rr_device, + ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); @@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) __func__, ep); complete(&ia->ri_done); break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: removing device for %pIS:%u\n", + sap, rpc_get_port(sap)); +#endif + set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); + ep->rep_connected = -ENODEV; + xprt_force_disconnect(&xprt->rx_xprt); + wait_for_completion(&ia->ri_remove_done); + + ia->ri_id = NULL; + ia->ri_pd = NULL; + ia->ri_device = NULL; + /* Return 1 to ensure the core destroys the id. */ + return 1; case RDMA_CM_EVENT_ESTABLISHED: connstate = 1; ib_query_qp(ia->ri_id->qp, attr, @@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) goto connected; case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; - goto connected; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - connstate = -ENODEV; connected: dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); @@ -329,14 +344,6 @@ connected: return 0; } -static void rpcrdma_destroy_id(struct rdma_cm_id *id) -{ - if (id) { - module_put(id->device->owner); - rdma_destroy_id(id); - } -} - static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, int rc; init_completion(&ia->ri_done); + init_completion(&ia->ri_remove_done); id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); @@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, goto out; } - /* FIXME: - * Until xprtrdma supports DEVICE_REMOVAL, the provider must - * be pinned while there are active NFS/RDMA mounts to prevent - * hangs and crashes at umount time. - */ - if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { - dprintk("RPC: %s: Failed to get device module\n", - __func__); - ia->ri_async_rc = -ENODEV; - } rc = ia->ri_async_rc; if (rc) goto out; @@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto put; + goto out; } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { dprintk("RPC: %s: wait() exited: %i\n", __func__, rc); - goto put; + goto out; } rc = ia->ri_async_rc; if (rc) - goto put; + goto out; return id; -put: - module_put(id->device->owner); + out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -413,13 +410,16 @@ out: * Exported functions. */ -/* - * Open and initialize an Interface Adapter. - * o initializes fields of struct rpcrdma_ia, including - * interface and provider attributes and protection zone. +/** + * rpcrdma_ia_open - Open and initialize an Interface Adapter. + * @xprt: controlling transport + * @addr: IP address of remote peer + * + * Returns 0 on success, negative errno if an appropriate + * Interface Adapter could not be found and opened. */ int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) +rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) { struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; @@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); - goto out1; + goto out_err; } ia->ri_device = ia->ri_id->device; @@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out2; + goto out_err; } - switch (memreg) { + switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRMR: if (frwr_is_supported(ia)) { ia->ri_ops = &rpcrdma_frwr_memreg_ops; @@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /*FALLTHROUGH*/ default: - pr_err("rpcrdma: Unsupported memory registration mode: %d\n", - memreg); + pr_err("rpcrdma: Device %s does not support memreg mode %d\n", + ia->ri_device->name, xprt_rdma_memreg_strategy); rc = -EINVAL; - goto out3; + goto out_err; } return 0; -out3: - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; -out2: - rpcrdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; -out1: +out_err: + rpcrdma_ia_close(ia); return rc; } -/* - * Clean up/close an IA. - * o if event handles and PD have been initialized, free them. - * o close the IA +/** + * rpcrdma_ia_remove - Handle device driver unload + * @ia: interface adapter being removed + * + * Divest transport H/W resources associated with this adapter, + * but allow it to be restored later. + */ +void +rpcrdma_ia_remove(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + /* This is similar to rpcrdma_ep_destroy, but: + * - Don't cancel the connect worker. + * - Don't call rpcrdma_ep_disconnect, which waits + * for another conn upcall, which will deadlock. + * - rdma_disconnect is unneeded, the underlying + * connection is already gone. + */ + if (ia->ri_id->qp) { + ib_drain_qp(ia->ri_id->qp); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + ib_free_cq(ep->rep_attr.recv_cq); + ib_free_cq(ep->rep_attr.send_cq); + + /* The ULP is responsible for ensuring all DMA + * mappings and MRs are gone. + */ + list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) + rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); + rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); + rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + } + rpcrdma_destroy_mrs(buf); + + /* Allow waiters to continue */ + complete(&ia->ri_remove_done); +} + +/** + * rpcrdma_ia_close - Clean up/close an IA. + * @ia: interface adapter to close + * */ void rpcrdma_ia_close(struct rpcrdma_ia *ia) @@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rpcrdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; + rdma_destroy_id(ia->ri_id); } + ia->ri_id = NULL; + ia->ri_device = NULL; /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; } /* @@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_free_cq(ep->rep_attr.send_cq); } +/* Re-establish a connection after a device removal event. + * Unlike a normal reconnection, a fresh PD and a new set + * of MRs and buffers is needed. + */ +static int +rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; + int rc, err; + + pr_info("%s: r_xprt = %p\n", __func__, r_xprt); + + rc = -EHOSTUNREACH; + if (rpcrdma_ia_open(r_xprt, sap)) + goto out1; + + rc = -ENOMEM; + err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + if (err) { + pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); + goto out2; + } + + rc = -ENETUNREACH; + err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (err) { + pr_err("rpcrdma: rdma_create_qp returned %d\n", err); + goto out3; + } + + rpcrdma_create_mrs(r_xprt); + return 0; + +out3: + rpcrdma_ep_destroy(ep, ia); +out2: + rpcrdma_ia_close(ia); +out1: + return rc; +} + +static int +rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia) +{ + struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; + struct rdma_cm_id *id, *old; + int err, rc; + + dprintk("RPC: %s: reconnecting...\n", __func__); + + rpcrdma_ep_disconnect(ep, ia); + + rc = -EHOSTUNREACH; + id = rpcrdma_create_id(r_xprt, ia, sap); + if (IS_ERR(id)) + goto out; + + /* As long as the new ID points to the same device as the + * old ID, we can reuse the transport's existing PD and all + * previously allocated MRs. Also, the same device means + * the transport's previous DMA mappings are still valid. + * + * This is a sanity check only. There should be no way these + * point to two different devices here. + */ + old = id; + rc = -ENETUNREACH; + if (ia->ri_device != id->device) { + pr_err("rpcrdma: can't reconnect on different device!\n"); + goto out_destroy; + } + + err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (err) { + dprintk("RPC: %s: rdma_create_qp returned %d\n", + __func__, err); + goto out_destroy; + } + + /* Atomically replace the transport's ID and QP. */ + rc = 0; + old = ia->ri_id; + ia->ri_id = id; + rdma_destroy_qp(old); + +out_destroy: + rdma_destroy_id(old); +out: + return rc; +} + /* * Connect unconnected endpoint. */ @@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rdma_cm_id *id, *old; - struct sockaddr *sap; unsigned int extras; - int rc = 0; + int rc; - if (ep->rep_connected != 0) { retry: - dprintk("RPC: %s: reconnecting...\n", __func__); - - rpcrdma_ep_disconnect(ep, ia); - - sap = (struct sockaddr *)&r_xprt->rx_data.addr; - id = rpcrdma_create_id(r_xprt, ia, sap); - if (IS_ERR(id)) { - rc = -EHOSTUNREACH; - goto out; - } - /* TEMP TEMP TEMP - fail if new device: - * Deregister/remarshal *all* requests! - * Close and recreate adapter, pd, etc! - * Re-determine all attributes still sane! - * More stuff I haven't thought of! - * Rrrgh! - */ - if (ia->ri_device != id->device) { - printk("RPC: %s: can't reconnect on " - "different device!\n", __func__); - rpcrdma_destroy_id(id); - rc = -ENETUNREACH; - goto out; - } - /* END TEMP */ - rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); - if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); - rpcrdma_destroy_id(id); - rc = -ENETUNREACH; - goto out; - } - - old = ia->ri_id; - ia->ri_id = id; - - rdma_destroy_qp(old); - rpcrdma_destroy_id(old); - } else { + switch (ep->rep_connected) { + case 0: dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - /* do not update ep->rep_connected */ - return -ENETUNREACH; + rc = -ENETUNREACH; + goto out_noupdate; } + break; + case -ENODEV: + rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + if (rc) + goto out_noupdate; + break; + default: + rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + if (rc) + goto out; } ep->rep_connected = 0; @@ -736,6 +845,8 @@ retry: out: if (rc) ep->rep_connected = rc; + +out_noupdate: return rc; } @@ -878,7 +989,6 @@ struct rpcrdma_rep * rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_rep *rep; int rc; @@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) goto out_free; } - rep->rr_device = ia->ri_device; rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); @@ -1037,6 +1146,7 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { cancel_delayed_work_sync(&buf->rb_recovery_worker); + cancel_delayed_work_sync(&buf->rb_refresh_worker); while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) out_nomws: dprintk("RPC: %s: no MWs available\n", __func__); - schedule_delayed_work(&buf->rb_refresh_worker, 0); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_delayed_work(&buf->rb_refresh_worker, 0); /* Allow the reply handler and refresh worker to run */ cond_resched(); @@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { + struct ib_device *device = ia->ri_device; + if (rb->rg_direction == DMA_NONE) return false; - rb->rg_iov.addr = ib_dma_map_single(ia->ri_device, + rb->rg_iov.addr = ib_dma_map_single(device, (void *)rb->rg_base, rdmab_length(rb), rb->rg_direction); - if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb))) + if (ib_dma_mapping_error(device, rdmab_addr(rb))) return false; - rb->rg_device = ia->ri_device; + rb->rg_device = device; rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; return true; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 171a35116de9..1d66acf1a723 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -69,6 +69,7 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct completion ri_done; + struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; @@ -78,10 +79,15 @@ struct rpcrdma_ia { bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; + unsigned long ri_flags; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; +enum { + RPCRDMA_IAF_REMOVING = 0, +}; + /* * RDMA Endpoint -- one per transport instance */ @@ -164,6 +170,12 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +static inline struct ib_device * +rdmab_device(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device; +} + #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) /* To ensure a transport can always make forward progress, @@ -209,7 +221,6 @@ struct rpcrdma_rep { unsigned int rr_len; int rr_wc_flags; u32 rr_inv_rkey; - struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; struct list_head rr_list; @@ -380,7 +391,6 @@ struct rpcrdma_buffer { spinlock_t rb_mwlock; /* protect rb_mws list */ struct list_head rb_mws; struct list_head rb_all; - char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ int rb_send_count, rb_recv_count; @@ -497,10 +507,16 @@ struct rpcrdma_xprt { * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ extern int xprt_rdma_pad_optimize; +/* This setting controls the hunt for a supported memory + * registration strategy. + */ +extern unsigned int xprt_rdma_memreg_strategy; + /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); +void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); bool fmr_is_supported(struct rpcrdma_ia *); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 919981324171..9aed6fe1bf1a 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -106,7 +106,6 @@ __init int net_sysctl_init(void) ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; - register_sysctl_root(&net_sysctl_root); out: return ret; out1: diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 0d4f2f455a7c..1b92b72e812f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -362,25 +362,25 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) return 0; } -#define tipc_wait_for_cond(sock_, timeout_, condition_) \ -({ \ - int rc_ = 0; \ - int done_ = 0; \ - \ - while (!(condition_) && !done_) { \ - struct sock *sk_ = sock->sk; \ - DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ - \ - rc_ = tipc_sk_sock_err(sock_, timeout_); \ - if (rc_) \ - break; \ - prepare_to_wait(sk_sleep(sk_), &wait_, \ - TASK_INTERRUPTIBLE); \ - done_ = sk_wait_event(sk_, timeout_, \ - (condition_), &wait_); \ - remove_wait_queue(sk_sleep(sk_), &wait_); \ - } \ - rc_; \ +#define tipc_wait_for_cond(sock_, timeo_, condition_) \ +({ \ + struct sock *sk_; \ + int rc_; \ + \ + while ((rc_ = !(condition_))) { \ + DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ + sk_ = (sock_)->sk; \ + rc_ = tipc_sk_sock_err((sock_), timeo_); \ + if (rc_) \ + break; \ + prepare_to_wait(sk_sleep(sk_), &wait_, TASK_INTERRUPTIBLE); \ + release_sock(sk_); \ + *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \ + sched_annotate_sleep(); \ + lock_sock(sk_); \ + remove_wait_queue(sk_sleep(sk_), &wait_); \ + } \ + rc_; \ }) /** diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 6f7f6757ceef..dfc8c51e4d74 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1540,8 +1540,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, long timeout; int err; struct vsock_transport_send_notify_data send_data; - - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); @@ -1584,11 +1583,10 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - while (total_written < len) { ssize_t written; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && @@ -1597,33 +1595,30 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); - timeout = schedule_timeout(timeout); + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 9dffe0282ad4..403d86e80162 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -576,9 +576,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, - vsock->vqs, callbacks, names, - NULL); + ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX, + vsock->vqs, callbacks, names, + NULL); if (ret < 0) goto out; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 8b911c29860e..5a1a98df3499 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1791,32 +1791,40 @@ void x25_kill_by_neigh(struct x25_neigh *nb) static int __init x25_init(void) { - int rc = proto_register(&x25_proto, 0); + int rc; - if (rc != 0) + rc = proto_register(&x25_proto, 0); + if (rc) goto out; rc = sock_register(&x25_family_ops); - if (rc != 0) + if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); - if (rc != 0) + if (rc) goto out_sock; - pr_info("Linux Version 0.2\n"); + rc = x25_register_sysctl(); + if (rc) + goto out_dev; - x25_register_sysctl(); rc = x25_proc_init(); - if (rc != 0) - goto out_dev; + if (rc) + goto out_sysctl; + + pr_info("Linux Version 0.2\n"); + out: return rc; +out_sysctl: + x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: + dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c index a06dfe143c67..ba078c85f0a1 100644 --- a/net/x25/sysctl_net_x25.c +++ b/net/x25/sysctl_net_x25.c @@ -73,9 +73,12 @@ static struct ctl_table x25_table[] = { { }, }; -void __init x25_register_sysctl(void) +int __init x25_register_sysctl(void) { x25_table_header = register_net_sysctl(&init_net, "net/x25", x25_table); + if (!x25_table_header) + return -ENOMEM; + return 0; } void x25_unregister_sysctl(void) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 8ec8a3fcf8d4..574e6f32f94f 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -170,7 +170,7 @@ static int xfrm_dev_feat_change(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->hw_features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); xfrm_garbage_collect(dev_net(dev)); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b00a1d5a7f52..ed4e52d95172 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1797,43 +1797,6 @@ free_dst: goto out; } -#ifdef CONFIG_XFRM_SUB_POLICY -static int xfrm_dst_alloc_copy(void **target, const void *src, int size) -{ - if (!*target) { - *target = kmalloc(size, GFP_ATOMIC); - if (!*target) - return -ENOMEM; - } - - memcpy(*target, src, size); - return 0; -} -#endif - -static int xfrm_dst_update_parent(struct dst_entry *dst, - const struct xfrm_selector *sel) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->partner), - sel, sizeof(*sel)); -#else - return 0; -#endif -} - -static int xfrm_dst_update_origin(struct dst_entry *dst, - const struct flowi *fl) -{ -#ifdef CONFIG_XFRM_SUB_POLICY - struct xfrm_dst *xdst = (struct xfrm_dst *)dst; - return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); -#else - return 0; -#endif -} - static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) @@ -1905,16 +1868,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; - if (num_pols > 1) - err = xfrm_dst_update_parent(dst, &pols[1]->selector); - else - err = xfrm_dst_update_origin(dst, fl); - if (unlikely(err)) { - dst_free(dst); - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); - return ERR_PTR(err); - } - xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index fc3c5aa38754..2e291bc5f1fc 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1383,6 +1383,8 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig) x->curlft.add_time = orig->curlft.add_time; x->km.state = orig->km.state; x->km.seq = orig->km.seq; + x->replay = orig->replay; + x->preplay = orig->preplay; return x; |