diff options
Diffstat (limited to 'net')
83 files changed, 1309 insertions, 1007 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index ddfa86648f95..903a190319b9 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -272,6 +272,7 @@ static int p9_fd_read(struct p9_client *client, void *v, int len) { int ret; struct p9_trans_fd *ts = NULL; + loff_t pos; if (client && client->status != Disconnected) ts = client->trans; @@ -282,7 +283,8 @@ static int p9_fd_read(struct p9_client *client, void *v, int len) if (!(ts->rd->f_flags & O_NONBLOCK)) p9_debug(P9_DEBUG_ERROR, "blocking read ...\n"); - ret = kernel_read(ts->rd, ts->rd->f_pos, v, len); + pos = ts->rd->f_pos; + ret = kernel_read(ts->rd, v, len, &pos); if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) client->status = Disconnected; return ret; @@ -420,8 +422,7 @@ error: static int p9_fd_write(struct p9_client *client, void *v, int len) { - int ret; - mm_segment_t oldfs; + ssize_t ret; struct p9_trans_fd *ts = NULL; if (client && client->status != Disconnected) @@ -433,12 +434,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len) if (!(ts->wr->f_flags & O_NONBLOCK)) p9_debug(P9_DEBUG_ERROR, "blocking write ...\n"); - oldfs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos); - set_fs(oldfs); - + ret = kernel_write(ts->wr, v, len, &ts->wr->f_pos); if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) client->status = Disconnected; return ret; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. - source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 303c779bfe38..43ba91c440bc 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -58,7 +58,7 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data); -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data); +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size); static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err); static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control, @@ -1473,7 +1473,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -2987,12 +2987,15 @@ static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, return len; } -static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) +static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size) { struct l2cap_conf_opt *opt = *ptr; BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val); + if (size < L2CAP_CONF_OPT_SIZE + len) + return; + opt->type = type; opt->len = len; @@ -3017,7 +3020,7 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) *ptr += L2CAP_CONF_OPT_SIZE + len; } -static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) +static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size) { struct l2cap_conf_efs efs; @@ -3045,7 +3048,7 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) } l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, size); } static void l2cap_ack_timeout(struct work_struct *work) @@ -3191,11 +3194,12 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan) chan->ack_win = chan->tx_win; } -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_req *req = data; struct l2cap_conf_rfc rfc = { .mode = chan->mode }; void *ptr = req->data; + void *endptr = data + data_size; u16 size; BT_DBG("chan %p", chan); @@ -3220,7 +3224,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) done: if (chan->imtu != L2CAP_DEFAULT_MTU) - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); switch (chan->mode) { case L2CAP_MODE_BASIC: @@ -3239,7 +3243,7 @@ done: rfc.max_pdu_size = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; case L2CAP_MODE_ERTM: @@ -3259,21 +3263,21 @@ done: L2CAP_DEFAULT_TX_WINDOW); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (test_bit(FLAG_EXT_CTRL, &chan->flags)) l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; @@ -3291,17 +3295,17 @@ done: rfc.max_pdu_size = cpu_to_le16(size); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; } @@ -3312,10 +3316,11 @@ done: return ptr - data; } -static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_rsp *rsp = data; void *ptr = rsp->data; + void *endptr = data + data_size; void *req = chan->conf_req; int len = chan->conf_len; int type, hint, olen; @@ -3417,7 +3422,7 @@ done: return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); } if (result == L2CAP_CONF_SUCCESS) { @@ -3430,7 +3435,7 @@ done: chan->omtu = mtu; set_bit(CONF_MTU_DONE, &chan->conf_state); } - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr); if (remote_efs) { if (chan->local_stype != L2CAP_SERV_NOTRAFIC && @@ -3444,7 +3449,7 @@ done: l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } else { /* Send PENDING Conf Rsp */ result = L2CAP_CONF_PENDING; @@ -3477,7 +3482,7 @@ done: set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) { chan->remote_id = efs.id; @@ -3491,7 +3496,7 @@ done: le32_to_cpu(efs.sdu_itime); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } break; @@ -3505,7 +3510,7 @@ done: set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; @@ -3527,10 +3532,11 @@ done: } static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, - void *data, u16 *result) + void *data, size_t size, u16 *result) { struct l2cap_conf_req *req = data; void *ptr = req->data; + void *endptr = data + size; int type, olen; unsigned long val; struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; @@ -3548,13 +3554,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->imtu = L2CAP_DEFAULT_MIN_MTU; } else chan->imtu = val; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); break; case L2CAP_CONF_FLUSH_TO: chan->flush_to = val; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, - 2, chan->flush_to); + 2, chan->flush_to, endptr - ptr); break; case L2CAP_CONF_RFC: @@ -3568,13 +3574,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->fcs = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); break; case L2CAP_CONF_EWS: chan->ack_win = min_t(u16, val, chan->ack_win); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); break; case L2CAP_CONF_EFS: @@ -3587,7 +3593,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); break; case L2CAP_CONF_FCS: @@ -3692,7 +3698,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) return; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3900,7 +3906,7 @@ sendresp: u8 buf[128]; set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3978,7 +3984,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, break; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, req), req); + l2cap_build_conf_req(chan, req, sizeof(req)), req); chan->num_conf_req++; break; @@ -4090,7 +4096,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, } /* Complete config. */ - len = l2cap_parse_conf_req(chan, rsp); + len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp)); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto unlock; @@ -4124,7 +4130,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) { u8 buf[64]; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -4184,7 +4190,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, char buf[64]; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - buf, &result); + buf, sizeof(buf), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4214,7 +4220,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, /* throw out any old stored conf requests */ result = L2CAP_CONF_SUCCESS; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - req, &result); + req, sizeof(req), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4791,7 +4797,7 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result, set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } } @@ -7465,7 +7471,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 875675765531..63edc6e5f026 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -676,7 +676,8 @@ bad: /* * Do a synchronous statfs(). */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; @@ -696,6 +697,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) goto out; req->u.st = buf; + req->request->hdr.version = cpu_to_le16(2); mutex_lock(&monc->mutex); register_generic_request(req); @@ -705,6 +707,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + h->contains_data_pool = (data_pool != CEPH_NOPOOL); + h->data_pool = cpu_to_le64(data_pool); send_generic_request(monc, req); mutex_unlock(&monc->mutex); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index dcfbdd74dfd1..e02f01f534e2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -863,8 +863,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; - case CEPH_OSD_OP_STARTSYNC: - break; case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); dst->watch.ver = cpu_to_le64(0); @@ -916,9 +914,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, * if the file was recently truncated, we include information about its * old and new size so that the object can be updated appropriately. (we * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. */ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index f358d0bfa76b..79d14d70b7ea 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -2445,19 +2445,34 @@ static void apply_upmap(struct ceph_osdmap *osdmap, pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { - for (i = 0; i < raw->size; i++) { - for (j = 0; j < pg->pg_upmap_items.len; j++) { - int from = pg->pg_upmap_items.from_to[j][0]; - int to = pg->pg_upmap_items.from_to[j][1]; - - if (from == raw->osds[i]) { - if (!(to != CRUSH_ITEM_NONE && - to < osdmap->max_osd && - osdmap->osd_weight[to] == 0)) - raw->osds[i] = to; + /* + * Note: this approach does not allow a bidirectional swap, + * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. + */ + for (i = 0; i < pg->pg_upmap_items.len; i++) { + int from = pg->pg_upmap_items.from_to[i][0]; + int to = pg->pg_upmap_items.from_to[i][1]; + int pos = -1; + bool exists = false; + + /* make sure replacement doesn't already appear */ + for (j = 0; j < raw->size; j++) { + int osd = raw->osds[j]; + + if (osd == to) { + exists = true; break; } + /* ignore mapping if target is marked out */ + if (osd == from && pos < 0 && + !(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) { + pos = j; + } } + if (!exists && pos >= 0) + raw->osds[pos] = to; } } } diff --git a/net/compat.c b/net/compat.c index 6ded6c821d7a..22381719718c 100644 --- a/net/compat.c +++ b/net/compat.c @@ -185,6 +185,13 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); } + /* + * check the length of messages copied in is the same as the + * what we get from the first loop + */ + if ((char *)kcmsg - (char *)kcmsg_base != kcmlen) + goto Einval; + /* Ok, looks like we made it. Hook it up and return success. */ kmsg->msg_control = kcmsg_base; kmsg->msg_controllen = kcmlen; diff --git a/net/core/dev.c b/net/core/dev.c index fb766d906148..588b473194a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1948,8 +1948,12 @@ again: goto again; } out_unlock: - if (pt_prev) - pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + if (pt_prev) { + if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) + pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); + else + kfree_skb(skb2); + } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); @@ -3892,6 +3896,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); + skb->mac_header += off; switch (act) { case XDP_REDIRECT: diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 6a582ae4c5d9..3228411ada0f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -525,6 +525,8 @@ convert_link_ksettings_to_legacy_settings( = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; + legacy_settings->transceiver + = link_ksettings->base.transceiver; return retval; } diff --git a/net/core/filter.c b/net/core/filter.c index 3a50a9b021e2..74b8c91fb5f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -989,10 +989,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) @@ -1794,7 +1798,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; - const struct bpf_prog *map_owner; + unsigned long map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -2500,27 +2504,31 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, + unsigned long aux) +{ + return (unsigned long)xdp_prog->aux != aux; +} + static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); - const struct bpf_prog *map_owner = ri->map_owner; + unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; + struct net_device *fwd = NULL; u32 index = ri->ifindex; - struct net_device *fwd; int err; ri->ifindex = 0; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; - /* This is really only caused by a deliberately crappy - * BPF program, normally we would never hit that case, - * so no need to inform someone via tracepoints either, - * just bail out. - */ - if (unlikely(map_owner != xdp_prog)) - return -EINVAL; + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; + } fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2576,13 +2584,27 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + unsigned long map_owner = ri->map_owner; + struct bpf_map *map = ri->map; + struct net_device *fwd = NULL; u32 index = ri->ifindex; - struct net_device *fwd; unsigned int len; int err = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; + ri->map = NULL; + ri->map_owner = 0; + + if (map) { + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; + } + fwd = __dev_map_lookup_elem(map, index); + } else { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + } if (unlikely(!fwd)) { err = -EINVAL; goto err; @@ -2600,10 +2622,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, } skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); + map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) + : _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) + : _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); @@ -2618,7 +2642,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; ri->map = NULL; - ri->map_owner = NULL; + ri->map_owner = 0; return XDP_REDIRECT; } @@ -2632,7 +2656,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { }; BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, - const struct bpf_prog *, map_owner) + unsigned long, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 0385dece1f6f..7c1ffd6f9501 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -83,10 +83,10 @@ static void est_timer(unsigned long arg) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log); + rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..d4bcdcc68e92 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3854,6 +3854,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..23953b741a41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); @@ -1682,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2afa99506f8b..865e29e62bad 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1301,28 +1301,33 @@ int dsa_slave_create(struct dsa_port *port, const char *name) p->old_duplex = -1; port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(p, slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(p->phy); + if (of_phy_is_fixed_link(p->dp->dn)) + of_phy_deregister_fixed_link(p->dp->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->netdev = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 416bb304a281..1859c473b21a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4089c013cb03..c039c937ba90 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -266,7 +266,7 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, #if IS_ENABLED(CONFIG_IPV6) if (tb->fast_sk_family == AF_INET6) return ipv6_rcv_saddr_equal(&tb->fast_v6_rcv_saddr, - &sk->sk_v6_rcv_saddr, + inet6_rcv_saddr(sk), tb->fast_rcv_saddr, sk->sk_rcv_saddr, tb->fast_ipv6_only, @@ -321,13 +321,14 @@ tb_found: goto fail_unlock; } success: - if (!hlist_empty(&tb->owners)) { + if (hlist_empty(&tb->owners)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -354,6 +355,7 @@ success: tb->fastuid = uid; tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); + tb->fast_sk_family = sk->sk_family; #if IS_ENABLED(CONFIG_IPV6) tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; #endif @@ -916,7 +918,6 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, tcp_sk(child)->fastopen_rsk = NULL; } inet_csk_destroy_sock(child); - reqsk_put(req); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, @@ -987,6 +988,7 @@ void inet_csk_listen_stop(struct sock *sk) sock_hold(child); inet_child_forget(sk, req, child); + reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..b20c8ac64081 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0162fb955b33..467e44d7587d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, @@ -733,7 +731,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { + if (skb->len - dev->hard_header_len > dev->mtu) { pskb_trim(skb, dev->mtu); truncate = true; } @@ -1223,6 +1221,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -1246,13 +1245,16 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e558e4f9597b..a599aa83fdad 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1207,7 +1207,6 @@ e_inval: void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || ipv6_sk_rxinfo(sk); @@ -1221,8 +1220,13 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) * (e.g., process binds socket to eth0 for Tx which is * redirected to loopback in the rtable/dst). */ - if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX || l3slave) + struct rtable *rt = skb_rtable(skb); + bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); + + if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) pktinfo->ipi_ifindex = inet_iif(skb); + else if (l3slave && rt && rt->rt_iif) + pktinfo->ipi_ifindex = rt->rt_iif; pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e1856bfa753d..e9805ad664ac 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -176,7 +176,7 @@ skip_key_lookup: return cand; t = rcu_dereference(itn->collect_md_tun); - if (t) + if (t && t->dev->flags & IFF_UP) return t; if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5ed63d250950..89453cf62158 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..3d9f1c2f81c5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. */ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } @@ -2507,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..85164d4d3e53 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5b6690d05abb..0bc9e46a5369 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -991,6 +991,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; + struct sk_buff *oskb = NULL; struct tcp_md5sig_key *md5; struct tcphdr *th; int err; @@ -998,12 +999,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); - skb->skb_mstamp = tp->tcp_mstamp; if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; - tcp_rate_skb_sent(sk, skb); - + oskb = skb; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else @@ -1011,6 +1010,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (unlikely(!skb)) return -ENOBUFS; } + skb->skb_mstamp = tp->tcp_mstamp; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); @@ -1122,12 +1122,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - if (likely(err <= 0)) - return err; - - tcp_enter_cwr(sk); - - return net_xmit_eval(err); + if (unlikely(err > 0)) { + tcp_enter_cwr(sk); + err = net_xmit_eval(err); + } + if (!err && oskb) { + oskb->skb_mstamp = tp->tcp_mstamp; + tcp_rate_skb_sent(sk, oskb); + } + return err; } /* This routine just queues the buffer for sending. @@ -1803,40 +1806,6 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, return !after(end_seq, tcp_wnd_end(tp)); } -/* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) - * should be put on the wire right now. If so, it returns the number of - * packets allowed by the congestion window. - */ -static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, - unsigned int cur_mss, int nonagle) -{ - const struct tcp_sock *tp = tcp_sk(sk); - unsigned int cwnd_quota; - - tcp_init_tso_segs(skb, cur_mss); - - if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; - - cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; - - return cwnd_quota; -} - -/* Test if sending is allowed right now. */ -bool tcp_may_send_now(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = tcp_send_head(sk); - - return skb && - tcp_snd_test(sk, skb, tcp_current_mss(sk), - (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); -} - /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet * which is put after SKB on the list. It is very much like * tcp_fragment() except that it may make several kinds of assumptions @@ -2869,10 +2838,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - skb->skb_mstamp = tp->tcp_mstamp; nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; + if (!err) + skb->skb_mstamp = tp->tcp_mstamp; } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -3419,6 +3389,10 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) goto done; } + /* data was not sent, this is our new send_head */ + sk->sk_send_head = syn_data; + tp->packets_out -= tcp_skb_pcount(syn_data); + fallback: /* Send a regular SYN with Fast Open cookie request option */ if (fo->cookie.len > 0) @@ -3471,6 +3445,11 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; + buff = tcp_send_head(sk); + if (unlikely(buff)) { + tp->snd_nxt = TCP_SKB_CB(buff)->seq; + tp->pushed_seq = TCP_SKB_CB(buff)->seq; + } TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef29df8648e4..5676237d2b0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2221,9 +2221,10 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2234,24 +2235,24 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; /* we are supposed to accept bcast packets */ if (skb->pkt_type == PACKET_MULTICAST) { ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return; + return 0; } sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, @@ -2263,7 +2264,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2272,12 +2273,23 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } + return 0; } int udp_rcv(struct sk_buff *skb) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 97658bfc1b58..e360d55be555 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78787ec..4a96ebbf8eda 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1399,10 +1399,18 @@ static inline int ipv6_saddr_preferred(int type) return 0; } -static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +static bool ipv6_use_optimistic_addr(struct net *net, + struct inet6_dev *idev) { #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic) + return false; + + return true; #else return false; #endif @@ -1472,7 +1480,7 @@ static int ipv6_get_saddr_eval(struct net *net, /* Rule 3: Avoid deprecated and optimistic addresses */ u8 avoid = IFA_F_DEPRECATED; - if (!ipv6_use_optimistic_addr(score->ifa->idev)) + if (!ipv6_use_optimistic_addr(net, score->ifa->idev)) avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || !(score->ifa->flags & avoid); @@ -2460,7 +2468,8 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, int max_addresses = in6_dev->cnf.max_addresses; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (in6_dev->cnf.optimistic_dad && + if ((net->ipv6.devconf_all->optimistic_dad || + in6_dev->cnf.optimistic_dad) && !net->ipv6.devconf_all->forwarding && sllao) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3051,7 +3060,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, u32 addr_flags = flags | IFA_F_PERMANENT; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - if (idev->cnf.optimistic_dad && + if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad || + idev->cnf.optimistic_dad) && !dev_net(idev->dev)->ipv6.devconf_all->forwarding) addr_flags |= IFA_F_OPTIMISTIC; #endif @@ -3810,7 +3820,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - idev->cnf.accept_dad < 1 || + (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bump_id = ifp->flags & IFA_F_TENTATIVE; @@ -3841,7 +3852,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) */ if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(idev)) { + if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -3897,7 +3908,9 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 || + idev->cnf.accept_dad > 1) && + !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { struct in6_addr addr; @@ -4940,9 +4953,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing - * TENTATIVE flag. + * TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index b7a72d409334..1602b491b281 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -940,24 +940,25 @@ done: } static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) + unsigned short type, const void *daddr, + const void *saddr, unsigned int len) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = skb_push(skb, t->hlen); - __be16 *p = (__be16 *)(ipv6h+1); + struct ipv6hdr *ipv6h; + __be16 *p; - ip6_flow_hdr(ipv6h, 0, - ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, true, - &t->fl.u.ip6)); + ipv6h = skb_push(skb, t->hlen + sizeof(*ipv6h)); + ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, + t->fl.u.ip6.flowlabel, + true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; ipv6h->daddr = t->parms.raddr; - p[0] = t->parms.o_flags; - p[1] = htons(type); + p = (__be16 *)(ipv6h + 1); + p[0] = t->parms.o_flags; + p[1] = htons(type); /* * Set the source hardware address. @@ -1310,6 +1311,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728faca7..4a87f9428ca5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 10a693a19323..a1c24443cd9e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -171,7 +171,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ } t = rcu_dereference(ip6n->collect_md_tun); - if (t) + if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); @@ -1043,6 +1043,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1125,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1134,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; @@ -2259,6 +2260,9 @@ static int __init ip6_tunnel_init(void) { int err; + if (!ipv6_mod_enabled()) + return -EOPNOTSUPP; + err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 79444a4bfd6d..bcdc2d557de1 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..a96d5b385d8f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1325,7 +1325,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 7ff54db73a48..825b8e01f947 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -72,10 +72,6 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - /* make sure it's a Segment Routing header (Routing Type 4) */ - if (srh->type != IPV6_SRCRT_TYPE_4) - return NULL; - len = (srh->hdrlen + 1) << 3; if (!pskb_may_pull(skb, srhoff + len)) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2ecfb137297..40d7234c27b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1015,6 +1015,7 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, */ offset = skb_transport_offset(skb); skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + csum = skb->csum; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1685,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. */ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); @@ -1750,6 +1751,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ @@ -160,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -254,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index f236c0bc7b3f..51063d9ed0f7 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -1041,12 +1041,24 @@ out: static int mtype_head(struct ip_set *set, struct sk_buff *skb) { - const struct htype *h = set->data; + struct htype *h = set->data; const struct htable *t; struct nlattr *nested; size_t memsize; u8 htable_bits; + /* If any members have expired, set->elements will be wrong + * mytype_expire function will update it with the right count. + * we do not hold set->lock here, so grab it first. + * set->elements can still be incorrect in the case of a huge set, + * because elements might time out during the listing. + */ + if (SET_WITH_TIMEOUT(set)) { + spin_lock_bh(&set->lock); + mtype_expire(set, h); + spin_unlock_bh(&set->lock); + } + rcu_read_lock_bh(); t = rcu_dereference_bh_nfnl(h->table); memsize = mtype_ahash_memsize(h, t) + set->ext_size; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f393a7086025..af8345fc4fbd 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -429,7 +429,7 @@ nf_nat_setup_info(struct nf_conn *ct, srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); @@ -532,9 +532,9 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) unsigned int h; h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) @@ -807,8 +807,8 @@ static int __init nf_nat_init(void) /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; - if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) - nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); + if (nf_nat_htable_size < CONNTRACK_LOCKS) + nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -821,7 +821,7 @@ static int __init nf_nat_init(void) return ret; } - for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); nf_ct_helper_expectfn_register(&follow_master_nat); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..94c11cf0459d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2270,10 +2270,13 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, mutex_unlock(nlk->cb_mutex); + ret = 0; if (cb->start) - cb->start(cb); + ret = cb->start(cb); + + if (!ret) + ret = netlink_dump(sk); - ret = netlink_dump(sk); sock_put(sk); if (ret) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 76cf273a56c7..c3aec6227c91 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1112,7 +1112,8 @@ static int ovs_nla_init_match_and_action(struct net *net, if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attribute not present in set flow."); - return -EINVAL; + error = -EINVAL; + goto error; } *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c26172995511..bec01a3daf5b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1684,10 +1684,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) mutex_lock(&fanout_mutex); - err = -EINVAL; - if (!po->running) - goto out; - err = -EALREADY; if (po->fanout) goto out; @@ -1749,7 +1745,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) list_add(&match->list, &fanout_list); } err = -EINVAL; - if (match->type == type && + + spin_lock(&po->bind_lock); + if (po->running && + match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; @@ -1761,6 +1760,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) err = 0; } } + spin_unlock(&po->bind_lock); + + if (err && !refcount_read(&match->sk_ref)) { + list_del(&match->list); + kfree(match); + } + out: if (err && rollover) { kfree(rollover); @@ -2834,6 +2840,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2877,6 +2884,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2935,7 +2943,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; @@ -3063,13 +3071,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a306974e2fb4..da6fa82c98a8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -53,10 +53,13 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } -static void free_tcf(struct rcu_head *head) +/* XXX: For standalone actions, we don't need a RCU grace period either, because + * actions are always connected to filters and filters are already destroyed in + * RCU callbacks, so after a RCU grace period actions are already disconnected + * from filters. Readers later can not find us. + */ +static void free_tcf(struct tc_action *p) { - struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); - free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); @@ -76,11 +79,7 @@ static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) idr_remove_ext(&idrinfo->action_idr, p->tcfa_index); spin_unlock_bh(&idrinfo->lock); gen_kill_estimator(&p->tcfa_rate_est); - /* - * gen_estimator est_timer() might access p->tcfa_lock - * or bstats, wait a RCU grace period before freeing p - */ - call_rcu(&p->tcfa_rcu, free_tcf); + free_tcf(p); } int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) @@ -181,7 +180,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, idr_for_each_entry_ext(idr, p, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) { - module_put(p->ops->owner); + module_put(ops->owner); n_i++; } else if (ret < 0) { goto nla_put_failure; @@ -259,7 +258,7 @@ void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est) { if (est) gen_kill_estimator(&a->tcfa_rate_est); - call_rcu(&a->tcfa_rcu, free_tcf); + free_tcf(a); } EXPORT_SYMBOL(tcf_idr_cleanup); @@ -515,13 +514,15 @@ EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct list_head *actions, int bind) { + const struct tc_action_ops *ops; struct tc_action *a, *tmp; int ret = 0; list_for_each_entry_safe(a, tmp, actions, list) { + ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) - module_put(a->ops->owner); + module_put(ops->owner); else if (ret < 0) return ret; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c743f03cfebd..0b2219adf520 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; - chain->refcnt = 0; + chain->refcnt = 1; return chain; } @@ -194,21 +194,20 @@ static void tcf_chain_flush(struct tcf_chain *chain) RCU_INIT_POINTER(*chain->p_filter_chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); + tcf_chain_put(chain); tcf_proto_destroy(tp); } } static void tcf_chain_destroy(struct tcf_chain *chain) { - /* May be already removed from the list by the previous call. */ - if (!list_empty(&chain->list)) - list_del_init(&chain->list); + list_del(&chain->list); + kfree(chain); +} - /* There might still be a reference held when we got here from - * tcf_block_put. Wait for the user to drop reference before free. - */ - if (!chain->refcnt) - kfree(chain); +static void tcf_chain_hold(struct tcf_chain *chain) +{ + ++chain->refcnt; } struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, @@ -217,24 +216,19 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) - goto incref; + if (chain->index == chain_index) { + tcf_chain_hold(chain); + return chain; + } } - chain = create ? tcf_chain_create(block, chain_index) : NULL; -incref: - if (chain) - chain->refcnt++; - return chain; + return create ? tcf_chain_create(block, chain_index) : NULL; } EXPORT_SYMBOL(tcf_chain_get); void tcf_chain_put(struct tcf_chain *chain) { - /* Destroy unused chain, with exception of chain 0, which is the - * default one and has to be always present. - */ - if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0) + if (--chain->refcnt == 0) tcf_chain_destroy(chain); } EXPORT_SYMBOL(tcf_chain_put); @@ -279,10 +273,31 @@ void tcf_block_put(struct tcf_block *block) if (!block) return; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + /* XXX: Standalone actions are not allowed to jump to any chain, and + * bound actions should be all removed after flushing. However, + * filters are destroyed in RCU callbacks, we have to hold the chains + * first, otherwise we would always race with RCU callbacks on this list + * without proper locking. + */ + + /* Wait for existing RCU callbacks to cool down. */ + rcu_barrier(); + + /* Hold a refcnt for all chains, except 0, in case they are gone. */ + list_for_each_entry(chain, &block->chain_list, list) + if (chain->index) + tcf_chain_hold(chain); + + /* No race on the list, because no chain could be destroyed. */ + list_for_each_entry(chain, &block->chain_list, list) tcf_chain_flush(chain); - tcf_chain_destroy(chain); - } + + /* Wait for RCU callbacks to release the reference count. */ + rcu_barrier(); + + /* At this point, all the chains should have refcnt == 1. */ + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -360,6 +375,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, rcu_assign_pointer(*chain->p_filter_chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); + tcf_chain_hold(chain); } static void tcf_chain_tp_remove(struct tcf_chain *chain, @@ -371,6 +387,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, if (chain->p_filter_chain && tp == chain->filter_chain) RCU_INIT_POINTER(*chain->p_filter_chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); + tcf_chain_put(chain); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1a267e77c6de..d230cb4c8094 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -922,28 +922,28 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; - goto errout; + goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) - goto errout; + goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; - goto errout; + goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) - goto errout; + goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { @@ -952,7 +952,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, &mask.key, fnew); if (err) - goto errout; + goto errout_idr; } if (!tc_in_hw(fnew->flags)) @@ -981,6 +981,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, kfree(tb); return 0; +errout_idr: + if (fnew->handle) + idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 21cc45caf842..eeac606c95ab 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -32,6 +32,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (tc_skip_sw(head->flags)) return -1; + *res = head->res; return tcf_exts_exec(skb, &head->exts, res); } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 98c05db85bcb..b1f6ed48bc72 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -389,7 +389,7 @@ static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) if ((data->hgenerator += 0x10000) == 0) data->hgenerator = 0x10000; h = data->hgenerator|salt; - if (rsvp_get(tp, h) == 0) + if (!rsvp_get(tp, h)) return h; } return 0; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 92237e75dbbc..bf8c81e07c70 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -685,6 +685,7 @@ void qdisc_reset(struct Qdisc *qdisc) qdisc->gso_skb = NULL; } qdisc->q.qlen = 0; + qdisc->qstats.backlog = 0; } EXPORT_SYMBOL(qdisc_reset); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index daaf214e5201..3f88b75488b0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -958,6 +958,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } if (cl != NULL) { + int old_flags; + if (parentid) { if (cl->cl_parent && cl->cl_parent->cl_common.classid != parentid) @@ -978,6 +980,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, } sch_tree_lock(sch); + old_flags = cl->cl_flags; + if (rsc != NULL) hfsc_change_rsc(cl, rsc, cur_time); if (fsc != NULL) @@ -986,10 +990,21 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, hfsc_change_usc(cl, usc, cur_time); if (cl->qdisc->q.qlen != 0) { - if (cl->cl_flags & HFSC_RSC) - update_ed(cl, qdisc_peek_len(cl->qdisc)); - if (cl->cl_flags & HFSC_FSC) - update_vf(cl, 0, cur_time); + int len = qdisc_peek_len(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) { + if (old_flags & HFSC_RSC) + update_ed(cl, len); + else + init_ed(cl, len); + } + + if (cl->cl_flags & HFSC_FSC) { + if (old_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + else + init_vf(cl, len); + } } sch_tree_unlock(sch); diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index e99518e79b52..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -279,9 +279,11 @@ out: return err; } -static int sctp_sock_dump(struct sock *sk, void *p) +static int sctp_sock_dump(struct sctp_transport *tsp, void *p) { + struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; @@ -289,9 +291,7 @@ static int sctp_sock_dump(struct sock *sk, void *p) int err = 0; lock_sock(sk); - if (!sctp_sk(sk)->ep) - goto release; - list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) { + list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; @@ -309,7 +309,6 @@ static int sctp_sock_dump(struct sock *sk, void *p) cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, commp->net_admin) < 0) { - cb->args[3] = 1; err = 1; goto release; } @@ -327,40 +326,30 @@ next: cb->args[4]++; } cb->args[1] = 0; - cb->args[2]++; cb->args[3] = 0; cb->args[4] = 0; release: release_sock(sk); - sock_put(sk); return err; } -static int sctp_get_sock(struct sctp_transport *tsp, void *p) +static int sctp_sock_filter(struct sctp_transport *tsp, void *p) { struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; - struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct sctp_association *assoc = list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ if (tsp->asoc != assoc) - goto out; + return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) - goto out; - - sock_hold(sk); - cb->args[5] = (long)sk; + return 0; return 1; - -out: - cb->args[2]++; - return 0; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) @@ -474,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -503,12 +493,9 @@ skip: if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; -next: - cb->args[5] = 0; - sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp); - - if (cb->args[5] && !sctp_sock_dump((struct sock *)cb->args[5], &commp)) - goto next; + sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1b00a1e09b93..d4730ada7f32 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4658,29 +4658,39 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - struct net *net, int pos, void *p) { + int (*cb_done)(struct sctp_transport *, void *), + struct net *net, int *pos, void *p) { struct rhashtable_iter hti; - void *obj; - int err; - - err = sctp_transport_walk_start(&hti); - if (err) - return err; + struct sctp_transport *tsp; + int ret; - obj = sctp_transport_get_idx(net, &hti, pos + 1); - for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) { - struct sctp_transport *transport = obj; +again: + ret = sctp_transport_walk_start(&hti); + if (ret) + return ret; - if (!sctp_transport_hold(transport)) + tsp = sctp_transport_get_idx(net, &hti, *pos + 1); + for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { + if (!sctp_transport_hold(tsp)) continue; - err = cb(transport, p); - sctp_transport_put(transport); - if (err) + ret = cb(tsp, p); + if (ret) break; + (*pos)++; + sctp_transport_put(tsp); } sctp_transport_walk_stop(&hti); - return err; + if (ret) { + if (cb_done && !cb_done(tsp, p)) { + (*pos)++; + sctp_transport_put(tsp); + goto again; + } + sctp_transport_put(tsp); + } + + return ret; } EXPORT_SYMBOL_GPL(sctp_for_each_transport); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8c6d24b2995d..745f145d4c4d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -282,6 +282,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, u8 *prefix_len) { struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; int len; @@ -298,14 +299,17 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); /* analyze IPv4 specific data of net_device belonging to TCP socket */ - for_ifa(dst->dev->ip_ptr) { - if (ifa->ifa_address != addr.sin_addr.s_addr) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) continue; *prefix_len = inet_mask_len(ifa->ifa_mask); *subnet = ifa->ifa_address & ifa->ifa_mask; rc = 0; break; - } endfor_ifa(dst->dev->ip_ptr); + } endfor_ifa(in_dev); + rcu_read_unlock(); out_rel: dst_release(dst); @@ -509,7 +513,7 @@ decline_rdma: /* RDMA setup failed, switch back to TCP */ smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code, 0); + rc = smc_clc_send_decline(smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } @@ -804,8 +808,6 @@ static void smc_listen_work(struct work_struct *work) rc = local_contact; if (rc == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) - reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ goto decline_rdma; } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; @@ -899,7 +901,7 @@ decline_rdma: smc_conn_free(&new_smc->conn); new_smc->use_fallback = true; if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(new_smc, reason_code, 0); + rc = smc_clc_send_decline(new_smc, reason_code); if (rc < sizeof(struct smc_clc_msg_decline)) goto out_err; } diff --git a/net/smc/smc.h b/net/smc/smc.h index 6e44313e4467..0ccd6fa387ad 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -149,7 +149,7 @@ struct smc_connection { atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ - struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3934913ab835..b7dd2743fb5c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -95,9 +95,10 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; - if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) - == SMC_CLC_DECL_SYNCERR) + if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { smc->conn.lgr->sync_err = true; + smc_lgr_terminate(smc->conn.lgr); + } } out: @@ -105,8 +106,7 @@ out: } /* send CLC DECLINE message across internal TCP socket */ -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync) +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) { struct smc_clc_msg_decline dclc; struct msghdr msg; @@ -118,7 +118,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, dclc.hdr.type = SMC_CLC_DECLINE; dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; - dclc.hdr.flag = out_of_sync ? 1 : 0; + dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 13db8ce177c9..1c55414041d4 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -106,8 +106,7 @@ struct smc_ib_device; int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); -int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, - u8 out_of_sync); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport); int smc_clc_send_confirm(struct smc_sock *smc); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 3c2e166b5d22..f0d16fb825f7 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -174,15 +174,15 @@ int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER) && - !(current->flags & PF_EXITING)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -208,7 +208,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_state == SMC_ACTIVE) { /* send close request */ @@ -234,7 +234,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); if (sk->sk_err != ECONNABORTED) { /* confirm close from peer */ @@ -263,7 +263,9 @@ again: /* peer sending PeerConnectionClosed will cause transition */ break; case SMC_PROCESSABORT: - cancel_work_sync(&conn->tx_work); + release_sock(sk); + cancel_delayed_work_sync(&conn->tx_work); + lock_sock(sk); smc_close_abort(conn); sk->sk_state = SMC_CLOSED; smc_close_wait_tx_pends(smc); @@ -411,13 +413,14 @@ void smc_close_sock_put_work(struct work_struct *work) int smc_close_shutdown_write(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; - long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; struct sock *sk = &smc->sk; int old_state; + long timeout; int rc = 0; - if (sock_flag(sk, SOCK_LINGER)) - timeout = sk->sk_lingertime; + timeout = current->flags & PF_EXITING ? + 0 : sock_flag(sk, SOCK_LINGER) ? + sk->sk_lingertime : SMC_MAX_STREAM_WAIT_TIMEOUT; again: old_state = sk->sk_state; @@ -425,7 +428,7 @@ again: case SMC_ACTIVE: smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* send close wr request */ rc = smc_close_wr(conn); @@ -439,7 +442,7 @@ again: if (!smc_cdc_rxed_any_close(conn)) smc_close_stream_wait(smc, timeout); release_sock(sk); - cancel_work_sync(&conn->tx_work); + cancel_delayed_work_sync(&conn->tx_work); lock_sock(sk); /* confirm close from peer */ rc = smc_close_wr(conn); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 1a16d51e2330..20b66e79c5d6 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -25,8 +25,9 @@ #include "smc_cdc.h" #include "smc_close.h" -#define SMC_LGR_NUM_INCR 256 -#define SMC_LGR_FREE_DELAY (600 * HZ) +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY_SERV (600 * HZ) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ @@ -107,8 +108,15 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (reduced && !lgr->conns_num) - schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); + if (!reduced || lgr->conns_num) + return; + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 547e0e113b17..0b5852299158 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -380,6 +380,7 @@ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); if (ndev) { memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + dev_put(ndev); } else if (!rc) { memcpy(&smcibdev->mac[ibport - 1][0], &smcibdev->gid[ibport - 1].raw[8], 3); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 78f7af28ae4f..31f8453c25c5 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -181,8 +181,10 @@ static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) sizeof(new_pnetelem->ndev->name)) || smc_pnet_same_ibname(pnetelem, new_pnetelem->smcibdev->ibdev->name, - new_pnetelem->ib_port)) + new_pnetelem->ib_port)) { + dev_put(pnetelem->ndev); goto found; + } } list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); rc = 0; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index b17a333e9bb0..3e631ae4b6b6 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -148,6 +148,8 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, read_done = sock_intr_errno(timeo); break; } + if (!timeo) + return -EAGAIN; } if (!atomic_read(&conn->bytes_to_rcv)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 3c656beb8820..3866573288dd 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -24,6 +24,8 @@ #include "smc_cdc.h" #include "smc_tx.h" +#define SMC_TX_WORK_DELAY HZ + /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() @@ -406,7 +408,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) goto out_unlock; } rc = 0; - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -430,7 +433,7 @@ out_unlock: */ static void smc_tx_work(struct work_struct *work) { - struct smc_connection *conn = container_of(work, + struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -468,7 +471,8 @@ void smc_tx_consumer_update(struct smc_connection *conn) if (!rc) rc = smc_cdc_msg_send(conn, wr_buf, pend); if (rc < 0) { - schedule_work(&conn->tx_work); + schedule_delayed_work(&conn->tx_work, + SMC_TX_WORK_DELAY); return; } smc_curs_write(&conn->rx_curs_confirmed, @@ -487,6 +491,6 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_WORK(&smc->conn.tx_work, smc_tx_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); spin_lock_init(&smc->conn.send_lock); } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index ab56bda66783..525d91e0d57e 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -244,7 +244,7 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) int rc; ib_req_notify_cq(link->smcibdev->roce_cq_send, - IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); pend = container_of(priv, struct smc_wr_tx_pend, priv); rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], &failed_wr); diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index ac701c28f44f..c2c68a15b59d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -171,10 +171,10 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) /* * Add the temporary list to the backchannel preallocation list */ - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_splice(&tmp_list, &xprt->bc_pa_list); xprt_inc_alloc_count(xprt, min_reqs); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); dprintk("RPC: setup backchannel transport done\n"); return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2e49d1f892b7..2ad827db2704 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1903,6 +1903,14 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { case -ECONNREFUSED: + /* A positive refusal suggests a rebind is needed. */ + if (RPC_IS_SOFTCONN(task)) + break; + if (clnt->cl_autobind) { + rpc_force_rebind(clnt); + task->tk_action = call_bind; + return; + } case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: @@ -2139,10 +2147,6 @@ call_status(struct rpc_task *task) rpc_delay(task, 3*HZ); case -ETIMEDOUT: task->tk_action = call_timeout; - if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && task->tk_client->cl_discrtry) - xprt_conditional_disconnect(req->rq_xprt, - req->rq_connect_cookie); break; case -ECONNREFUSED: case -ECONNRESET: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 399fab5d1936..ff8e06cd067e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1013,7 +1013,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock_bh(&bc_xprt->transport_lock); + spin_lock(&bc_xprt->recv_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1031,7 +1031,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1040,7 +1040,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return -EAGAIN; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4654a9934269..e741ec2b4d8e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -844,6 +844,50 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +/** + * xprt_pin_rqst - Pin a request on the transport receive list + * @req: Request to pin + * + * Caller must ensure this is atomic with the call to xprt_lookup_rqst() + * so should be holding the xprt transport lock. + */ +void xprt_pin_rqst(struct rpc_rqst *req) +{ + set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); +} +EXPORT_SYMBOL_GPL(xprt_pin_rqst); + +/** + * xprt_unpin_rqst - Unpin a request on the transport receive list + * @req: Request to pin + * + * Caller should be holding the xprt transport lock. + */ +void xprt_unpin_rqst(struct rpc_rqst *req) +{ + struct rpc_task *task = req->rq_task; + + clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); + if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) + wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); +} +EXPORT_SYMBOL_GPL(xprt_unpin_rqst); + +static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) +__must_hold(&req->rq_xprt->recv_lock) +{ + struct rpc_task *task = req->rq_task; + + if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { + spin_unlock(&req->rq_xprt->recv_lock); + set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, + TASK_UNINTERRUPTIBLE); + clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + spin_lock(&req->rq_xprt->recv_lock); + } +} + static void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; @@ -966,13 +1010,13 @@ void xprt_transmit(struct rpc_task *task) /* * Add to the list only if we're expecting a reply */ - spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ + spin_lock(&xprt->recv_lock); list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -1287,12 +1331,16 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); + spin_lock(&xprt->recv_lock); + if (!list_empty(&req->rq_list)) { + list_del(&req->rq_list); + xprt_wait_on_pinned_rqst(req); + } + spin_unlock(&xprt->recv_lock); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) xprt->ops->release_request(task); - if (!list_empty(&req->rq_list)) - list_del(&req->rq_list); xprt->last_used = jiffies; xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); @@ -1318,6 +1366,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); + spin_lock_init(&xprt->recv_lock); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 03f6b5840764..d31d0ac5ada9 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -49,6 +49,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -202,20 +203,24 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) */ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp; - - headerp = rdmab_to_msg(req->rl_rdmabuf); - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = - cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + __be32 *p; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + p = xdr_reserve_space(&req->rl_stream, 28); + if (unlikely(!p)) + return -EIO; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, &rqst->rq_snd_buf, rpcrdma_noch)) @@ -271,9 +276,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) * @xprt: transport receiving the call * @rep: receive buffer containing the call * - * Called in the RPC reply handler, which runs in a tasklet. - * Be quick about it. - * * Operational assumptions: * o Backchannel credits are ignored, just as the NFS server * forechannel currently does @@ -284,7 +286,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; struct svc_serv *bc_serv; struct rpcrdma_req *req; struct rpc_rqst *rqst; @@ -292,24 +293,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, size_t size; __be32 *p; - headerp = rdmab_to_msg(rep->rr_rdmabuf); + p = xdr_inline_decode(&rep->rr_stream, 0); + size = xdr_stream_remaining(&rep->rr_stream); + #ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: callback XID %08x, length=%u\n", - __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); - pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); + __func__, be32_to_cpup(p), size); + pr_info("RPC: %s: %*ph\n", __func__, size, p); #endif - /* Sanity check: - * Need at least enough bytes for RPC/RDMA header, as code - * here references the header fields by array offset. Also, - * backward calls are always inline, so ensure there - * are some bytes beyond the RPC/RDMA header. - */ - if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) - goto out_short; - p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - size = rep->rr_len - RPCRDMA_HDRLEN_MIN; - /* Grab a free bc rqst */ spin_lock(&xprt->bc_pa_lock); if (list_empty(&xprt->bc_pa_list)) { @@ -325,7 +317,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; - rqst->rq_xid = headerp->rm_xid; + rqst->rq_xid = *p; rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -337,9 +329,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, buf->len = size; /* The receive buffer has to be hooked to the rpcrdma_req - * so that it can be reposted after the server is done - * parsing it but just before sending the backward - * direction reply. + * so that it is not released while the req is pointing + * to its buffer, and so that it can be reposted after + * the Upper Layer is done decoding it. */ req = rpcr_to_rdmar(rqst); dprintk("RPC: %s: attaching rep %p to req %p\n", @@ -367,13 +359,4 @@ out_overflow: * when the connection is re-established. */ return; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) - xprt_disconnect_done(xprt); - else - pr_warn("RPC: %s: reposting rep %p\n", - __func__, rep); } diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index d3f84bb1d443..6c7151341194 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -177,7 +177,7 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ -static int +static struct rpcrdma_mr_seg * fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { @@ -188,7 +188,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -232,13 +232,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw->mw_offset = dma_pages[0] + pageoff; *out = mw; - return mw->mw_nents; + return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", @@ -247,7 +247,7 @@ out_maperr: ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); } /* Invalidate all memory regions that were registered for "req". diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 6aea36a38bfd..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -344,7 +344,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* Post a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ -static int +static struct rpcrdma_mr_seg * frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { @@ -364,7 +364,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_defer_mr_recovery(mw); mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); } while (mw->frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); @@ -429,25 +429,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw->mw_offset = mr->iova; *out = mw; - return mw->mw_nents; + return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); frmr->fr_state = FRMR_IS_INVALID; rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frmr->fr_mr, n, mw->mw_nents); rpcrdma_defer_mr_recovery(mw); - return -EIO; + return ERR_PTR(-EIO); out_senderr: pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); rpcrdma_defer_mr_recovery(mw); - return -ENOTCONN; + return ERR_PTR(-ENOTCONN); } /* Invalidate all memory regions that were registered for "req". diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ca4d6e4528f3..f1889f4d4803 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -169,40 +169,41 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } -/* Split "vec" on page boundaries into segments. FMR registers pages, - * not a byte range. Other modes coalesce these segments into a single - * MR when they can. +/* Split @vec on page boundaries into SGEs. FMR registers pages, not + * a byte range. Other modes coalesce these SGEs into a single MR + * when they can. + * + * Returns pointer to next available SGE, and bumps the total number + * of SGEs consumed. */ -static int -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) +static struct rpcrdma_mr_seg * +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, + unsigned int *n) { - size_t page_offset; - u32 remaining; + u32 remaining, page_offset; char *base; base = vec->iov_base; page_offset = offset_in_page(base); remaining = vec->iov_len; - while (remaining && n < RPCRDMA_MAX_SEGS) { - seg[n].mr_page = NULL; - seg[n].mr_offset = base; - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg[n].mr_len; - base += seg[n].mr_len; - ++n; + while (remaining) { + seg->mr_page = NULL; + seg->mr_offset = base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); + remaining -= seg->mr_len; + base += seg->mr_len; + ++seg; + ++(*n); page_offset = 0; } - return n; + return seg; } -/* - * Chunk assembly from upper layer xdr_buf. - * - * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk - * elements. Segments are then coalesced when registered, if possible - * within the selected memreg mode. +/* Convert @xdrbuf into SGEs no larger than a page each. As they + * are registered, these SGEs are then coalesced into RDMA segments + * when the selected memreg mode supports it. * - * Returns positive number of segments converted, or a negative errno. + * Returns positive number of SGEs consumed, or a negative errno. */ static int @@ -210,47 +211,41 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, unsigned int pos, enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) { - int len, n, p, page_base; + unsigned long page_base; + unsigned int len, n; struct page **ppages; n = 0; - if (pos == 0) { - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); - if (n == RPCRDMA_MAX_SEGS) - goto out_overflow; - } + if (pos == 0) + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); - p = 0; - while (len && n < RPCRDMA_MAX_SEGS) { - if (!ppages[p]) { - /* alloc the pagelist for receiving buffer */ - ppages[p] = alloc_page(GFP_ATOMIC); - if (!ppages[p]) + while (len) { + if (unlikely(!*ppages)) { + /* XXX: Certain upper layer operations do + * not provide receive buffer pages. + */ + *ppages = alloc_page(GFP_ATOMIC); + if (!*ppages) return -EAGAIN; } - seg[n].mr_page = ppages[p]; - seg[n].mr_offset = (void *)(unsigned long) page_base; - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); - if (seg[n].mr_len > PAGE_SIZE) - goto out_overflow; - len -= seg[n].mr_len; + seg->mr_page = *ppages; + seg->mr_offset = (char *)page_base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); + len -= seg->mr_len; + ++ppages; + ++seg; ++n; - ++p; - page_base = 0; /* page offset only applies to first page */ + page_base = 0; } - /* Message overflows the seg array */ - if (len && n == RPCRDMA_MAX_SEGS) - goto out_overflow; - /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) - return n; + goto out; /* When encoding a Write chunk, some servers need to see an * extra segment for non-XDR-aligned Write chunks. The upper @@ -258,30 +253,81 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * for this purpose. */ if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) - return n; + goto out; - if (xdrbuf->tail[0].iov_len) { - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); - if (n == RPCRDMA_MAX_SEGS) - goto out_overflow; - } + if (xdrbuf->tail[0].iov_len) + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); +out: + if (unlikely(n > RPCRDMA_MAX_SEGS)) + return -EIO; return n; +} -out_overflow: - pr_err("rpcrdma: segment array overflow\n"); - return -EIO; +static inline int +encode_item_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_one; + return 0; } -static inline __be32 * +static inline int +encode_item_not_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_zero; + return 0; +} + +static void xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) { *iptr++ = cpu_to_be32(mw->mw_handle); *iptr++ = cpu_to_be32(mw->mw_length); - return xdr_encode_hyper(iptr, mw->mw_offset); + xdr_encode_hyper(iptr, mw->mw_offset); } -/* XDR-encode the Read list. Supports encoding a list of read +static int +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + xdr_encode_rdma_segment(p, mw); + return 0; +} + +static int +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, + u32 position) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p++ = xdr_one; /* Item present */ + *p++ = cpu_to_be32(position); + xdr_encode_rdma_segment(p, mw); + return 0; +} + +/* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): @@ -290,23 +336,20 @@ xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) * N elements, position P (same P for all chunks of same arg!): * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Read list, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single @pos value is currently supported. */ -static __be32 * -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, struct rpc_rqst *rqst, - __be32 *iptr, enum rpcrdma_chunktype rtype) +static noinline int +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; unsigned int pos; - int n, nsegs; - - if (rtype == rpcrdma_noch) { - *iptr++ = xdr_zero; /* item not present */ - return iptr; - } + int nsegs; pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) @@ -315,40 +358,33 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, rtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - *iptr++ = xdr_one; /* item present */ - - /* All read segments in this chunk - * have the same "position". - */ - *iptr++ = cpu_to_be32(pos); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_read_segment(xdr, mw, pos) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); - /* Finish Read list */ - *iptr++ = xdr_zero; /* Next item not present */ - return iptr; + return 0; } -/* XDR-encode the Write list. Supports encoding a list containing - * one array of plain segments that belong to a single write chunk. +/* Register and XDR encode the Write list. Supports encoding a list + * containing one array of plain segments that belong to a single + * write chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -356,66 +392,65 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, * N elements: * 1 - N - HLOO - HLOO - ... - HLOO - 0 * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Write list, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single Write chunk is currently supported. */ -static __be32 * +static noinline int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, __be32 *iptr, - enum rpcrdma_chunktype wtype) + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - int n, nsegs, nchunks; + int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_writech) { - *iptr++ = xdr_zero; /* no Write list present */ - return iptr; - } - seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, wtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; - *iptr++ = xdr_one; /* Write list present */ - segcount = iptr++; /* save location of segment count */ + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_rdma_segment(xdr, mw) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); - /* Finish Write list */ - *iptr++ = xdr_zero; /* Next item not present */ - return iptr; + return 0; } -/* XDR-encode the Reply chunk. Supports encoding an array of plain - * segments that belong to a single write (reply) chunk. +/* Register and XDR encode the Reply chunk. Supports encoding an array + * of plain segments that belong to a single write (reply) chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -423,58 +458,57 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * N elements: * 1 - N - HLOO - HLOO - ... - HLOO * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Reply chunk, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. */ -static __be32 * -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, struct rpc_rqst *rqst, - __be32 *iptr, enum rpcrdma_chunktype wtype) +static noinline int +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - int n, nsegs, nchunks; + int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_replych) { - *iptr++ = xdr_zero; /* no Reply chunk present */ - return iptr; - } - seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; - *iptr++ = xdr_one; /* Reply chunk present */ - segcount = iptr++; /* save location of segment count */ + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_rdma_segment(xdr, mw) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); - return iptr; + return 0; } /* Prepare the RPC-over-RDMA header SGE. @@ -651,37 +685,52 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) req->rl_mapped_sges = 0; } -/* - * Marshal a request: the primary job of this routine is to choose - * the transfer modes. See comments below. +/** + * rpcrdma_marshal_req - Marshal and send one RPC request + * @r_xprt: controlling transport + * @rqst: RPC request to be marshaled + * + * For the RPC in "rqst", this function: + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) + * - Registers Read, Write, and Reply chunks + * - Constructs the transport header + * - Posts a Send WR to send the transport header and request * - * Returns zero on success, otherwise a negative errno. + * Returns: + * %0 if the RPC was sent successfully, + * %-ENOTCONN if the connection was lost, + * %-EAGAIN if not enough pages are available for on-demand reply buffer, + * %-ENOBUFS if no MRs are available to register chunks, + * %-EMSGSIZE if the transport header is too small, + * %-EIO if a permanent problem occurred while marshaling. */ - int -rpcrdma_marshal_req(struct rpc_rqst *rqst) +rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; - struct rpcrdma_msg *headerp; bool ddp_allowed; - ssize_t hdrlen; - size_t rpclen; - __be32 *iptr; + __be32 *p; + int ret; #if defined(CONFIG_SUNRPC_BACKCHANNEL) if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) return rpcrdma_bc_marshal_reply(rqst); #endif - headerp = rdmab_to_msg(req->rl_rdmabuf); - /* don't byte-swap XID, it's already done in request */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); - headerp->rm_type = rdma_msg; + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(xdr, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + /* Fixed header fields */ + ret = -EMSGSIZE; + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (!p) + goto out_err; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); /* When the ULP employs a GSS flavor that guarantees integrity * or privacy, direct data placement of individual data items @@ -721,22 +770,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * by themselves are larger than the inline threshold. */ if (rpcrdma_args_inline(r_xprt, rqst)) { + *p++ = rdma_msg; rtype = rpcrdma_noch; - rpclen = rqst->rq_snd_buf.len; } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + *p++ = rdma_msg; rtype = rpcrdma_readch; - rpclen = rqst->rq_snd_buf.head[0].iov_len + - rqst->rq_snd_buf.tail[0].iov_len; } else { r_xprt->rx_stats.nomsg_call_count++; - headerp->rm_type = htonl(RDMA_NOMSG); + *p++ = rdma_nomsg; rtype = rpcrdma_areadch; - rpclen = 0; } - req->rl_xid = rqst->rq_xid; - rpcrdma_insert_req(&r_xprt->rx_buf, req); - /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -759,79 +803,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. */ - iptr = headerp->rm_body.rm_chunks; - iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); - if (IS_ERR(iptr)) + if (rtype != rpcrdma_noch) { + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) goto out_err; - iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); - if (IS_ERR(iptr)) + + if (wtype == rpcrdma_writech) { + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) goto out_err; - iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); - if (IS_ERR(iptr)) + + if (wtype != rpcrdma_replych) + ret = encode_item_not_present(xdr); + else + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); + if (ret) goto out_err; - hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; - dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", + dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", rqst->rq_task->tk_pid, __func__, transfertypes[rtype], transfertypes[wtype], - hdrlen, rpclen); + xdr_stream_pos(xdr)); - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, + xdr_stream_pos(xdr), &rqst->rq_snd_buf, rtype)) { - iptr = ERR_PTR(-EIO); + ret = -EIO; goto out_err; } return 0; out_err: - if (PTR_ERR(iptr) != -ENOBUFS) { - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", - PTR_ERR(iptr)); + if (ret != -ENOBUFS) { + pr_err("rpcrdma: header marshaling failed (%d)\n", ret); r_xprt->rx_stats.failed_marshal_count++; } - return PTR_ERR(iptr); -} - -/* - * Chase down a received write or reply chunklist to get length - * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) - */ -static int -rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) -{ - unsigned int i, total_len; - struct rpcrdma_write_chunk *cur_wchunk; - char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); - - i = be32_to_cpu(**iptrp); - cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); - total_len = 0; - while (i--) { - struct rpcrdma_segment *seg = &cur_wchunk->wc_target; - ifdebug(FACILITY) { - u64 off; - xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); - dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n", - __func__, - be32_to_cpu(seg->rs_length), - (unsigned long long)off, - be32_to_cpu(seg->rs_handle)); - } - total_len += be32_to_cpu(seg->rs_length); - ++cur_wchunk; - } - /* check and adjust for properly terminated write chunk */ - if (wrchunk) { - __be32 *w = (__be32 *) cur_wchunk; - if (*w++ != xdr_zero) - return -1; - cur_wchunk = (struct rpcrdma_write_chunk *) w; - } - if ((char *)cur_wchunk > base + rep->rr_len) - return -1; - - *iptrp = (__be32 *) cur_wchunk; - return total_len; + return ret; } /** @@ -949,37 +964,254 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, } } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ static bool -rpcrdma_is_bcall(struct rpcrdma_msg *headerp) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + __be32 xid, __be32 proc) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) { - __be32 *p = (__be32 *)headerp; + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; - if (headerp->rm_type != rdma_msg) + if (proc != rdma_msg) return false; - if (headerp->rm_body.rm_chunks[0] != xdr_zero) + + /* Peek at stream contents without advancing. */ + p = xdr_inline_decode(xdr, 0); + + /* Chunk lists */ + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != headerp->rm_xid) + /* RPC header */ + if (*p++ != xid) return false; - /* call direction */ - if (p[8] != cpu_to_be32(RPC_CALL)) + if (*p != cpu_to_be32(RPC_CALL)) return false; + /* Now that we are sure this is a backchannel call, + * advance to the RPC header. + */ + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + + rpcrdma_bc_receive_call(r_xprt, rep); + return true; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) + xprt_disconnect_done(&r_xprt->rx_xprt); return true; } +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +{ + return false; +} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ +static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + ifdebug(FACILITY) { + u64 offset; + u32 handle; + + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", + __func__, *length, (unsigned long long)offset, + handle); + } else { + *length = be32_to_cpup(p + 1); + } + + return 0; +} + +static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) +{ + u32 segcount, seglength; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + segcount = be32_to_cpup(p); + while (segcount--) { + if (decode_rdma_segment(xdr, &seglength)) + return -EIO; + *length += seglength; + } + + dprintk("RPC: %s: segcount=%u, %u bytes\n", + __func__, be32_to_cpup(p), *length); + return 0; +} + +/* In RPC-over-RDMA Version One replies, a Read list is never + * expected. This decoder is a stub that returns an error if + * a Read list is present. + */ +static int decode_read_list(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != xdr_zero)) + return -EIO; + return 0; +} + +/* Supports only one Write chunk in the Write list + */ +static int decode_write_list(struct xdr_stream *xdr, u32 *length) +{ + u32 chunklen; + bool first; + __be32 *p; + + *length = 0; + first = true; + do { + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (*p == xdr_zero) + break; + if (!first) + return -EIO; + + if (decode_write_chunk(xdr, &chunklen)) + return -EIO; + *length += chunklen; + first = false; + } while (true); + return 0; +} + +static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + if (*p != xdr_zero) + if (decode_write_chunk(xdr, length)) + return -EIO; + return 0; +} + +static int +rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk, rpclen; + char *base; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_MSG sanity checks */ + if (unlikely(replychunk)) + return -EIO; + + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ + base = (char *)xdr_inline_decode(xdr, 0); + rpclen = xdr_stream_remaining(xdr); + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); + + r_xprt->rx_stats.total_rdma_reply += writelist; + return rpclen + xdr_align_size(writelist); +} + +static noinline int +rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_NOMSG sanity checks */ + if (unlikely(writelist)) + return -EIO; + if (unlikely(!replychunk)) + return -EIO; + + /* Reply chunk buffer already is the reply vector */ + r_xprt->rx_stats.total_rdma_reply += replychunk; + return replychunk; +} + +static noinline int +rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + switch (*p) { + case err_vers: + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); + if (!p) + break; + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1))); + break; + case err_chunk: + dprintk("RPC: %5u: %s: server reports header decoding error\n", + rqst->rq_task->tk_pid, __func__); + break; + default: + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + } + + r_xprt->rx_stats.bad_reply_count++; + return -EREMOTEIO; +} + /* Process received RPC/RDMA messages. * * Errors must result in the RPC task either being awakened, or @@ -991,51 +1223,48 @@ rpcrdma_reply_handler(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; + struct xdr_stream *xdr = &rep->rr_stream; struct rpcrdma_req *req; struct rpc_rqst *rqst; - __be32 *iptr; - int rdmalen, status, rmerr; + __be32 *p, xid, vers, proc; unsigned long cwnd; - struct list_head mws; + int status; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); - if (rep->rr_len == RPCRDMA_BAD_LEN) + if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) + + xdr_init_decode(xdr, &rep->rr_hdrbuf, + rep->rr_hdrbuf.head[0].iov_base); + + /* Fixed transport header fields */ + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) goto out_shortreply; + xid = *p++; + vers = *p++; + p++; /* credits */ + proc = *p++; - headerp = rdmab_to_msg(rep->rr_rdmabuf); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (rpcrdma_is_bcall(headerp)) - goto out_bcall; -#endif + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&buf->rb_lock); - req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, - headerp->rm_xid); - if (!req) - goto out_nomatch; - if (req->rl_reply) - goto out_duplicate; - - list_replace_init(&req->rl_registered, &mws); - rpcrdma_mark_remote_invalidation(&mws, rep); - - /* Avoid races with signals and duplicate replies - * by marking this req as matched. - */ + spin_lock(&xprt->recv_lock); + rqst = xprt_lookup_rqst(xprt, xid); + if (!rqst) + goto out_norqst; + xprt_pin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - spin_unlock(&buf->rb_lock); dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); + __func__, rep, req, be32_to_cpu(xid)); /* Invalidate and unmap the data payloads before waking the * waiting application. This guarantees the memory regions @@ -1044,99 +1273,42 @@ rpcrdma_reply_handler(struct work_struct *work) * waking the next RPC waits until this RPC has relinquished * all its Send Queue entries. */ - if (!list_empty(&mws)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws); + if (!list_empty(&req->rl_registered)) { + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + } - /* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. - */ - spin_lock_bh(&xprt->transport_lock); - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (!rqst) - goto out_norqst; xprt->reestablish_timeout = 0; - if (headerp->rm_vers != rpcrdma_version) + if (vers != rpcrdma_version) goto out_badversion; - /* check for expected message types */ - /* The order of some of these tests is important. */ - switch (headerp->rm_type) { + switch (proc) { case rdma_msg: - /* never expect read chunks */ - /* never expect reply chunks (two ways to check) */ - if (headerp->rm_body.rm_chunks[0] != xdr_zero || - (headerp->rm_body.rm_chunks[1] == xdr_zero && - headerp->rm_body.rm_chunks[2] != xdr_zero)) - goto badheader; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) { - /* count any expected write chunks in read reply */ - /* start at write chunk array count */ - iptr = &headerp->rm_body.rm_chunks[2]; - rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); - /* check for validity, and no reply chunk after */ - if (rdmalen < 0 || *iptr++ != xdr_zero) - goto badheader; - rep->rr_len -= - ((unsigned char *)iptr - (unsigned char *)headerp); - status = rep->rr_len + rdmalen; - r_xprt->rx_stats.total_rdma_reply += rdmalen; - /* special case - last chunk may omit padding */ - if (rdmalen &= 3) { - rdmalen = 4 - rdmalen; - status += rdmalen; - } - } else { - /* else ordinary inline */ - rdmalen = 0; - iptr = (__be32 *)((unsigned char *)headerp + - RPCRDMA_HDRLEN_MIN); - rep->rr_len -= RPCRDMA_HDRLEN_MIN; - status = rep->rr_len; - } - - r_xprt->rx_stats.fixup_copy_count += - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, - rdmalen); + status = rpcrdma_decode_msg(r_xprt, rep, rqst); break; - case rdma_nomsg: - /* never expect read or write chunks, always reply chunks */ - if (headerp->rm_body.rm_chunks[0] != xdr_zero || - headerp->rm_body.rm_chunks[1] != xdr_zero || - headerp->rm_body.rm_chunks[2] != xdr_one) - goto badheader; - iptr = (__be32 *)((unsigned char *)headerp + - RPCRDMA_HDRLEN_MIN); - rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); - if (rdmalen < 0) - goto badheader; - r_xprt->rx_stats.total_rdma_reply += rdmalen; - /* Reply chunk buffer already is the reply vector - no fixup. */ - status = rdmalen; + status = rpcrdma_decode_nomsg(r_xprt, rep); break; - case rdma_error: - goto out_rdmaerr; - -badheader: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; default: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, - be32_to_cpu(headerp->rm_type)); status = -EIO; - r_xprt->rx_stats.bad_reply_count++; - break; } + if (status < 0) + goto out_badheader; out: + spin_lock(&xprt->recv_lock); cwnd = xprt->cwnd; xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); xprt_complete_rqst(rqst->rq_task, status); - spin_unlock_bh(&xprt->transport_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); return; @@ -1149,72 +1321,38 @@ out_badstatus: } return; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -out_bcall: - rpcrdma_bc_receive_call(r_xprt, rep); - return; -#endif - /* If the incoming reply terminated a pending RPC, the next * RPC call will post a replacement receive buffer as it is * being marshaled. */ out_badversion: dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(headerp->rm_vers)); + __func__, be32_to_cpu(vers)); status = -EIO; r_xprt->rx_stats.bad_reply_count++; goto out; -out_rdmaerr: - rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err); - switch (rmerr) { - case ERR_VERS: - pr_err("%s: server reports header version error (%u-%u)\n", - __func__, - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low), - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high)); - break; - case ERR_CHUNK: - pr_err("%s: server reports header decoding error\n", - __func__); - break; - default: - pr_err("%s: server reports unknown error %d\n", - __func__, rmerr); - } - status = -EREMOTEIO; +out_badheader: + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); r_xprt->rx_stats.bad_reply_count++; + status = -EIO; goto out; -/* The req was still available, but by the time the transport_lock +/* The req was still available, but by the time the recv_lock * was acquired, the rqst and task had been released. Thus the RPC * has already been terminated. */ out_norqst: - spin_unlock_bh(&xprt->transport_lock); - rpcrdma_buffer_put(req); - dprintk("RPC: %s: race, no rqst left for req %p\n", - __func__, req); - return; + spin_unlock(&xprt->recv_lock); + dprintk("RPC: %s: no match for incoming xid 0x%08x\n", + __func__, be32_to_cpu(xid)); + goto repost; out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; -out_nomatch: - spin_unlock(&buf->rb_lock); - dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", - __func__, be32_to_cpu(headerp->rm_xid), - rep->rr_len); - goto repost; - -out_duplicate: - spin_unlock(&buf->rb_lock); - dprintk("RPC: %s: " - "duplicate reply %p to RPC request %p: xid 0x%08x\n", - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); - /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c676ed0efb5a..ec37ad83b068 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -52,7 +52,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -69,17 +69,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(req->rq_task); + spin_unlock_bh(&xprt->transport_lock); + ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); rcvbuf->len = 0; out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); out: return ret; @@ -266,7 +269,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xprt_rdma_bc_procs = { +static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d1c458e5ec4d..c84e2b644e13 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,7 +149,7 @@ static struct ctl_table sunrpc_table[] = { #endif -static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ +static const struct rpc_xprt_ops xprt_rdma_procs; static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -559,6 +559,7 @@ rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, r_xprt->rx_stats.hardway_register_count += size; req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); return true; } @@ -684,7 +685,6 @@ xprt_rdma_free(struct rpc_task *task) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - rpcrdma_remove_req(&r_xprt->rx_buf, req); if (!list_empty(&req->rl_registered)) ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); rpcrdma_unmap_sges(ia, req); @@ -730,7 +730,7 @@ xprt_rdma_send_request(struct rpc_task *task) if (unlikely(!list_empty(&req->rl_registered))) r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); - rc = rpcrdma_marshal_req(rqst); + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -811,7 +811,7 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt) * Plumbing for rpc transport switch and kernel module */ -static struct rpc_xprt_ops xprt_rdma_procs = { +static const struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e4171f2abe37..11a1fbf7e59e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -139,14 +139,11 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) static void rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) { - struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf); struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; + __be32 *p = rep->rr_rdmabuf->rg_base; u32 credits; - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) - return; - - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(p + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buffer->rb_max_requests) @@ -173,21 +170,19 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) goto out_fail; /* status == SUCCESS means all fields in wc are trustworthy */ - if (wc->opcode != IB_WC_RECV) - return; - dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", __func__, rep, wc->byte_len); - rep->rr_len = wc->byte_len; + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), rdmab_addr(rep->rr_rdmabuf), - rep->rr_len, DMA_FROM_DEVICE); + wc->byte_len, DMA_FROM_DEVICE); - rpcrdma_update_granted_credits(rep); + if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) + rpcrdma_update_granted_credits(rep); out_schedule: queue_work(rpcrdma_receive_wq, &rep->rr_work); @@ -198,7 +193,7 @@ out_fail: pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - rep->rr_len = RPCRDMA_BAD_LEN; + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); goto out_schedule; } @@ -974,6 +969,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; } + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; @@ -1004,7 +1001,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_pending); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b282d3f8cdd8..e26a97d2f922 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -218,18 +218,17 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; - unsigned int rr_len; int rr_wc_flags; u32 rr_inv_rkey; + struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; + struct xdr_buf rr_hdrbuf; + struct xdr_stream rr_stream; struct list_head rr_list; struct ib_recv_wr rr_recv_wr; - struct rpcrdma_regbuf *rr_rdmabuf; }; -#define RPCRDMA_BAD_LEN (~0U) - /* * struct rpcrdma_mw - external memory region metadata * @@ -341,11 +340,12 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - __be32 rl_xid; unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; + struct xdr_stream rl_stream; + struct xdr_buf rl_hdrbuf; struct ib_send_wr rl_send_wr; struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ @@ -403,7 +403,6 @@ struct rpcrdma_buffer { int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; - struct list_head rb_pending; u32 rb_max_requests; atomic_t rb_credits; /* most recent credit grant */ @@ -440,24 +439,27 @@ struct rpcrdma_create_data_internal { * Statistics for RPCRDMA */ struct rpcrdma_stats { + /* accessed when sending a call */ unsigned long read_chunk_count; unsigned long write_chunk_count; unsigned long reply_chunk_count; - unsigned long long total_rdma_request; - unsigned long long total_rdma_reply; + /* rarely accessed error counters */ unsigned long long pullup_copy_count; - unsigned long long fixup_copy_count; unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long nomsg_call_count; - unsigned long bcall_count; unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + + /* accessed when receiving a reply */ + unsigned long long total_rdma_reply; + unsigned long long fixup_copy_count; unsigned long local_inv_needed; + unsigned long nomsg_call_count; + unsigned long bcall_count; }; /* @@ -465,7 +467,8 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { - int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg * + (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, @@ -552,34 +555,6 @@ void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); -static inline void -rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) -{ - spin_lock(&buffers->rb_lock); - if (list_empty(&req->rl_list)) - list_add_tail(&req->rl_list, &buffers->rb_pending); - spin_unlock(&buffers->rb_lock); -} - -static inline struct rpcrdma_req * -rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) -{ - struct rpcrdma_req *pos; - - list_for_each_entry(pos, &buffers->rb_pending, rl_list) - if (pos->rl_xid == xid) - return pos; - return NULL; -} - -static inline void -rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) -{ - spin_lock(&buffers->rb_lock); - list_del(&req->rl_list); - spin_unlock(&buffers->rb_lock); -} - struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); @@ -638,10 +613,16 @@ enum rpcrdma_chunktype { bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, u32, struct xdr_buf *, enum rpcrdma_chunktype); void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); -int rpcrdma_marshal_req(struct rpc_rqst *); +int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_reply_handler(struct work_struct *work); +static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) +{ + xdr->head[0].iov_len = len; + xdr->len = len; +} + /* RPC/RDMA module init - xprtrdma/transport.c */ extern unsigned int xprt_rdma_max_inline_read; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4f154d388748..9b5de31aa429 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -969,10 +969,12 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; copied = rovr->rq_private_buf.buflen; @@ -981,13 +983,16 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { dprintk("RPC: sk_buff copy failed\n"); - goto out_unlock; + spin_lock(&xprt->recv_lock); + goto out_unpin; } + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_local_data_receive(struct sock_xprt *transport) @@ -1050,10 +1055,12 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1062,16 +1069,21 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); - goto out_unlock; + spin_lock(&xprt->recv_lock); + goto out_unpin; } __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); + spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); + spin_unlock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1277,25 +1289,12 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, } len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) { - struct xdr_skb_reader my_desc; - - len = transport->tcp_reclen - transport->tcp_offset; - memcpy(&my_desc, desc, sizeof(my_desc)); - my_desc.count = len; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - &my_desc, xdr_skb_read_bits); - desc->count -= r; - desc->offset += r; - } else - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, + if (len > transport->tcp_reclen - transport->tcp_offset) + desc->count = transport->tcp_reclen - transport->tcp_offset; + r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, desc, xdr_skb_read_bits); - if (r > 0) { - transport->tcp_copied += r; - transport->tcp_offset += r; - } - if (r != len) { + if (desc->count) { /* Error when copying to the receive buffer, * usually because we weren't able to allocate * additional buffer pages. All we can do now @@ -1315,6 +1314,10 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, return; } + transport->tcp_copied += r; + transport->tcp_offset += r; + desc->count = len - r; + dprintk("RPC: XID %08x read %zd bytes\n", ntohl(transport->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " @@ -1343,21 +1346,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); /* Find and lock the request corresponding to this xid */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, transport->tcp_xid); if (!req) { dprintk("RPC: XID %08x request not found!\n", ntohl(transport->tcp_xid)); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); return -1; } + xprt_pin_rqst(req); + spin_unlock(&xprt->recv_lock); xs_tcp_read_common(xprt, desc, req); + spin_lock(&xprt->recv_lock); if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_rqst(req->rq_task, transport->tcp_copied); - - spin_unlock_bh(&xprt->transport_lock); + xprt_unpin_rqst(req); + spin_unlock(&xprt->recv_lock); return 0; } @@ -1376,11 +1382,9 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, container_of(xprt, struct sock_xprt, xprt); struct rpc_rqst *req; - /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + /* Look up the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { - spin_unlock_bh(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; @@ -1391,7 +1395,6 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_bc_request(req, transport->tcp_copied); - spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1516,6 +1519,7 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) .arg.data = xprt, }; unsigned long total = 0; + int loop; int read = 0; mutex_lock(&transport->recv_mutex); @@ -1524,20 +1528,20 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) goto out; /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { + for (loop = 0; loop < 64; loop++) { lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); if (read <= 0) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); release_sock(sk); - if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - } else { - release_sock(sk); - total += read; + break; } + release_sock(sk); + total += read; rd_desc.count = 65536; } + if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + queue_work(xprtiod_workqueue, &transport->recv_worker); out: mutex_unlock(&transport->recv_mutex); trace_xs_tcp_data_ready(xprt, read, total); @@ -2724,7 +2728,7 @@ static void bc_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xs_local_ops = { +static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_alloc_slot, @@ -2742,7 +2746,7 @@ static struct rpc_xprt_ops xs_local_ops = { .disable_swap = xs_disable_swap, }; -static struct rpc_xprt_ops xs_udp_ops = { +static const struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, @@ -2764,7 +2768,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .inject_disconnect = xs_inject_disconnect, }; -static struct rpc_xprt_ops xs_tcp_ops = { +static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_lock_and_alloc_slot, @@ -2795,7 +2799,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { * The rpc_xprt_ops for the server backchannel */ -static struct rpc_xprt_ops bc_tcp_ops = { +static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 7d99029df342..a140dd4a84af 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, struct sk_buff_head xmitq; int rc = 0; - __skb_queue_head_init(&xmitq); + skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); @@ -263,7 +263,7 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, u32 dst, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); - __skb_queue_head_init(&_pkts); + skb_queue_head_init(&_pkts); list_for_each_entry_safe(n, tmp, &dests->list, list) { dst = n->value; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..17146c16ee2d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); @@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; + + if (!skb_cloned(skb)) + return true; + + /* Unclone buffer in case it was bundled */ + if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) + return false; + return true; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fa596fa71ba7..7d80040a37b6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -639,7 +639,7 @@ sendpage_end: return ret; } -void tls_sw_free_resources(struct sock *sk) +static void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0df8023f480b..690874293cfc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9987,6 +9987,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!setup.chandef.chan) + return -EINVAL; + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, &setup.beacon_rate); if (err) @@ -10903,6 +10906,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) |