From c0adf54a10903b59037a4c5fcb933dfeeb7b2624 Mon Sep 17 00:00:00 2001
From: shamir rabinovitch
Date: Thu, 30 Apr 2015 20:58:07 -0400
Subject: net/rds: fix unaligned memory access

rdma_conn_param private data is copied with memcpy after headers such as
cma_hdr (see cma_resolve_ib_udp as an example), so the start of the private
data is aligned to the end of the structure that precedes it. If that
structure ends with a u32, the private data starts on a 4-byte boundary.
Structures built from u8/u16/u32/u64 members are naturally aligned, but when
the structure itself does not start on an 8-byte boundary, none of its u64
members are 8-byte aligned either. To solve this we must use the special
macros that allow unaligned access to those members.

Addresses the following kernel log seen when attempting to use RDMA:

Kernel unaligned access at TPC[10507a88] rds_ib_cm_connect_complete+0x1bc/0x1e0 [rds_rdma]

Acked-by: Chien Yen
Signed-off-by: shamir rabinovitch
[Minor tweaks for top of tree by:]
Signed-off-by: David Ahern
Signed-off-by: David S. Miller
---
 net/rds/ib_cm.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/rds')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 31b74f5e61ad..29144a60019f 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -183,8 +183,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 
         /* If the peer gave us the last packet it saw, process this as if
          * we had received a regular ACK. */
-        if (dp && dp->dp_ack_seq)
-                rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+        if (dp) {
+                /* dp structure start is not guaranteed to be 8 bytes aligned.
+                 * Since dp_ack_seq is 64-bit extended load operations can be
+                 * used so go through get_unaligned to avoid unaligned errors.
+                 */
+                u64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
+
+                if (dp_ack_seq)
+                        rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+                                            NULL);
+        }
 
         rds_connect_complete(conn);
 }
-- 
cgit v1.2.3


From e2783717a71e9babfdd7c36c7e35b790d2c01022 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Mon, 4 May 2015 11:51:38 -0400
Subject: net/rds: Fix new sparse warning

Commit c0adf54a109 introduced new sparse warnings:

  CHECK   /home/dahern/kernels/linux.git/net/rds/ib_cm.c
  net/rds/ib_cm.c:191:34: warning: incorrect type in initializer (different base types)
  net/rds/ib_cm.c:191:34:    expected unsigned long long [unsigned] [usertype] dp_ack_seq
  net/rds/ib_cm.c:191:34:    got restricted __be64
  net/rds/ib_cm.c:194:51: warning: cast to restricted __be64

The temporary variable for the sequence number should have been declared as
__be64 rather than u64. Make it so.

Signed-off-by: David Ahern
Cc: shamir rabinovitch
Signed-off-by: David S. Miller
---
 net/rds/ib_cm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/rds')

diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 29144a60019f..8a09ee7db3c1 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -188,7 +188,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                  * Since dp_ack_seq is 64-bit extended load operations can be
                  * used so go through get_unaligned to avoid unaligned errors.
                  */
-                u64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
+                __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
 
                 if (dp_ack_seq)
                         rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
-- 
cgit v1.2.3
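For readers who have not hit this class of bug before, the pattern the two
ib_cm.c patches rely on can be sketched in isolation as below. This is only an
illustrative kernel-context sketch: struct example_priv_data and
example_read_seq() are made-up names, while get_unaligned() and be64_to_cpu()
are the same helpers used in the patches above.

#include <asm/byteorder.h>      /* be64_to_cpu() */
#include <asm/unaligned.h>      /* get_unaligned() */
#include <linux/types.h>        /* __be32, __be64, u64 */

/* Hypothetical private-data layout. Because it is memcpy'd in right after a
 * header that may end on a 4-byte boundary, an instance of this struct can
 * start at an address that is only 4-byte aligned, so 'seq' cannot safely be
 * read with a plain 64-bit load on strict-alignment CPUs such as sparc64.
 */
struct example_priv_data {
        __be32 hdr;
        __be64 seq;
};

static u64 example_read_seq(const struct example_priv_data *dp)
{
        /* get_unaligned() performs the load in an alignment-safe way;
         * be64_to_cpu() then converts from wire (big-endian) to host order.
         */
        __be64 raw = get_unaligned(&dp->seq);

        return be64_to_cpu(raw);
}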
From f711a6ae062caeee46067b2f2f12ffda319ae73c Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan
Date: Tue, 5 May 2015 15:20:51 -0400
Subject: net/rds: RDS-TCP: Always create a new rds_sock for an incoming connection.

When running RDS over TCP, the active (client) side connects to the
listening ("passive") side at the RDS_TCP_PORT. After the connection is
established, if the client side reboots (potentially without even sending a
FIN), the server still has a TCP socket in the established state. If the
server side now receives a new SYN from the client with a different client
port, TCP will create a new socket-pair, but the RDS layer will incorrectly
pull up the old rds_connection (which is still associated with the stale
t_sock and RDS socket state).

This patch corrects this behavior by having rds_tcp_accept_one() always
create a new connection for an incoming TCP SYN. The RDS and TCP state
associated with the old socket-pair is cleaned up via the
rds_tcp_state_change() callback, which is typically invoked when the client
TCP sends a FIN on restart, triggering a transition to CLOSE_WAIT state. In
the rarer event of client death without a FIN, TCP keepalive probes on the
socket will detect the stale socket, and the TCP transition to CLOSE state
will trigger the RDS state cleanup.

Signed-off-by: Sowmini Varadhan
Signed-off-by: David S. Miller
---
 net/rds/connection.c  |  4 ++++
 net/rds/tcp_connect.c |  1 +
 net/rds/tcp_listen.c  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

(limited to 'net/rds')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 14f041398ca1..60f0cd6ed15f 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -126,7 +126,10 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
         struct rds_transport *loop_trans;
         unsigned long flags;
         int ret;
+        struct rds_transport *otrans = trans;
 
+        if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+                goto new_conn;
         rcu_read_lock();
         conn = rds_conn_lookup(head, laddr, faddr, trans);
         if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
@@ -142,6 +145,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
         if (conn)
                 goto out;
 
+new_conn:
         conn = kmem_cache_zalloc(rds_conn_slab, gfp);
         if (!conn) {
                 conn = ERR_PTR(-ENOMEM);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index f9f564a6c960..973109c7b8e8 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -62,6 +62,7 @@ void rds_tcp_state_change(struct sock *sk)
         case TCP_ESTABLISHED:
                 rds_connect_complete(conn);
                 break;
+        case TCP_CLOSE_WAIT:
         case TCP_CLOSE:
                 rds_conn_drop(conn);
         default:
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 23ab4dcd1d9f..0da49e34495f 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -45,12 +45,45 @@ static void rds_tcp_accept_worker(struct work_struct *work);
 static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
 static struct socket *rds_tcp_listen_sock;
 
+static int rds_tcp_keepalive(struct socket *sock)
+{
+        /* values below based on xs_udp_default_timeout */
+        int keepidle = 5; /* send a probe 'keepidle' secs after last data */
+        int keepcnt = 5; /* number of unack'ed probes before declaring dead */
+        int keepalive = 1;
+        int ret = 0;
+
+        ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+                                (char *)&keepalive, sizeof(keepalive));
+        if (ret < 0)
+                goto bail;
+
+        ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
+                                (char *)&keepcnt, sizeof(keepcnt));
+        if (ret < 0)
+                goto bail;
+
+        ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
+                                (char *)&keepidle, sizeof(keepidle));
+        if (ret < 0)
+                goto bail;
+
+        /* KEEPINTVL is the interval between successive probes. We follow
+         * the model in xs_tcp_finish_connecting() and re-use keepidle.
+         */
+        ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
+                                (char *)&keepidle, sizeof(keepidle));
+bail:
+        return ret;
+}
+
 static int rds_tcp_accept_one(struct socket *sock)
 {
         struct socket *new_sock = NULL;
         struct rds_connection *conn;
         int ret;
         struct inet_sock *inet;
+        struct rds_tcp_connection *rs_tcp;
 
         ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
                                sock->sk->sk_protocol, &new_sock);
@@ -63,6 +96,10 @@ static int rds_tcp_accept_one(struct socket *sock)
         if (ret < 0)
                 goto out;
 
+        ret = rds_tcp_keepalive(new_sock);
+        if (ret < 0)
+                goto out;
+
         rds_tcp_tune(new_sock);
 
         inet = inet_sk(new_sock->sk);
@@ -77,6 +114,15 @@ static int rds_tcp_accept_one(struct socket *sock)
                 ret = PTR_ERR(conn);
                 goto out;
         }
+        /* An incoming SYN request came in, and TCP just accepted it.
+         * We always create a new conn for listen side of TCP, and do not
+         * add it to the c_hash_list.
+         *
+         * If the client reboots, this conn will need to be cleaned up.
+         * rds_tcp_state_change() will do that cleanup
+         */
+        rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
+        WARN_ON(!rs_tcp || rs_tcp->t_sock);
 
         /*
          * see the comment above rds_queue_delayed_reconnect()
-- 
cgit v1.2.3
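As a side note on the keepalive values chosen in rds_tcp_keepalive() above:
the userspace equivalent of the same tuning would look roughly like the
sketch below (illustrative only, not part of the patch; tcp_keepalive_tune()
is a made-up name, while the socket options are the standard Linux ones).
With keepidle = keepintvl = keepcnt = 5, a dead peer that never sends a FIN
is declared dead after roughly 5 + 5 * 5 = 30 seconds of silence.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Userspace sketch of the tuning rds_tcp_keepalive() applies via
 * kernel_setsockopt(): start probing an idle peer after 5s, probe every 5s,
 * and give up after 5 unanswered probes.
 */
static int tcp_keepalive_tune(int fd)
{
        int keepalive = 1, keepidle = 5, keepintvl = 5, keepcnt = 5;

        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE,
                       &keepalive, sizeof(keepalive)) < 0)
                return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT,
                       &keepcnt, sizeof(keepcnt)) < 0)
                return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE,
                       &keepidle, sizeof(keepidle)) < 0)
                return -1;
        return setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL,
                          &keepintvl, sizeof(keepintvl));
}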
From c82ac7e69efe6dbe370d6ba84e2666d7692ef1c2 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan
Date: Tue, 5 May 2015 15:20:52 -0400
Subject: net/rds: RDS-TCP: only initiate reconnect attempt on outgoing TCP socket.

When the peer of an RDS-TCP connection restarts, a reconnect attempt should
only be made from the active side of the TCP connection, i.e. the side that
has a transient TCP port number. Do not add the passive side of the TCP
connection to the c_hash_node and thus avoid triggering
rds_queue_reconnect() for passive rds connections.

Signed-off-by: Sowmini Varadhan
Signed-off-by: David S. Miller
---
 net/rds/connection.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/rds')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 60f0cd6ed15f..da6da57e5f36 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -234,13 +234,22 @@ new_conn:
                 /* Creating normal conn */
                 struct rds_connection *found;
 
-                found = rds_conn_lookup(head, laddr, faddr, trans);
+                if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
+                        found = NULL;
+                else
+                        found = rds_conn_lookup(head, laddr, faddr, trans);
                 if (found) {
                         trans->conn_free(conn->c_transport_data);
                         kmem_cache_free(rds_conn_slab, conn);
                         conn = found;
                 } else {
-                        hlist_add_head_rcu(&conn->c_hash_node, head);
+                        if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
+                            (otrans->t_type != RDS_TRANS_TCP)) {
+                                /* Only the active side should be added to
+                                 * reconnect list for TCP.
+                                 */
+                                hlist_add_head_rcu(&conn->c_hash_node, head);
+                        }
                         rds_cong_add_conn(conn);
                         rds_conn_count++;
                 }
-- 
cgit v1.2.3
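The compound condition guarding hlist_add_head_rcu() above reduces to "hash
every connection except the passive side of an RDS-TCP connection". A
hypothetical helper stating the same rule more directly is sketched below;
it is not part of the patch, rds_conn_should_hash() is a made-up name, and
it assumes the RDS-internal struct rds_transport / RDS_TRANS_TCP definitions
from net/rds/rds.h.

#include <linux/types.h>        /* bool */

/* Hypothetical restatement of the condition added above: only connections
 * that may ever need a reconnect attempt are put on the connection hash via
 * c_hash_node, i.e. everything except the passive (accepted) side of an
 * RDS-TCP connection.
 */
static bool rds_conn_should_hash(struct rds_transport *otrans, int is_outgoing)
{
        if (otrans->t_type != RDS_TRANS_TCP)
                return true;    /* non-TCP transports are always hashed */
        return is_outgoing;     /* RDS-TCP: only the active side is hashed */
}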