From 71c87e0cedca843162206c698cfa02e5fea9e2e3 Mon Sep 17 00:00:00 2001 From: Jan-Bernd Themann Date: Wed, 8 Aug 2007 22:38:05 -0700 Subject: [NET]: Generic Large Receive Offload for TCP traffic This patch provides generic Large Receive Offload (LRO) functionality for IPv4/TCP traffic. LRO combines received tcp packets to a single larger tcp packet and passes them then to the network stack in order to increase performance (throughput). The interface supports two modes: Drivers can either pass SKBs or fragment lists to the LRO engine. Signed-off-by: Jan-Bernd Themann Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 8 + net/ipv4/Makefile | 1 + net/ipv4/inet_lro.c | 600 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 609 insertions(+) create mode 100644 net/ipv4/inet_lro.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index fb7909774254..d894f616c3d6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -394,6 +394,14 @@ config INET_XFRM_MODE_BEET If unsure, say Y. +config INET_LRO + tristate "Large Receive Offload (ipv4/tcp)" + + ---help--- + Support for Large Receive Offload (ipv4/tcp). + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index fbf1674e0c2a..a02c36d0a13e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o +obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c new file mode 100644 index 000000000000..20bc593bb963 --- /dev/null +++ b/net/ipv4/inet_lro.c @@ -0,0 +1,600 @@ +/* + * linux/net/ipv4/inet_lro.c + * + * Large Receive Offload (ipv4 / tcp) + * + * (C) Copyright IBM Corp. 2007 + * + * Authors: + * Jan-Bernd Themann + * Christoph Raisch + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jan-Bernd Themann "); +MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); + +#define TCP_HDR_LEN(tcph) (tcph->doff << 2) +#define IP_HDR_LEN(iph) (iph->ihl << 2) +#define TCP_PAYLOAD_LENGTH(iph, tcph) \ + (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) + +#define IPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_W_TIMESTAMP 8 + +#define LRO_MAX_PG_HLEN 64 + +#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } + +/* + * Basic tcp checks whether packet is suitable for LRO + */ + +static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, + int len, struct net_lro_desc *lro_desc) +{ + /* check ip header: don't aggregate padded frames */ + if (ntohs(iph->tot_len) != len) + return -1; + + if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) + return -1; + + if (iph->ihl != IPH_LEN_WO_OPTIONS) + return -1; + + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack + || tcph->rst || tcph->syn || tcph->fin) + return -1; + + if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) + return -1; + + if (tcph->doff != TCPH_LEN_WO_OPTIONS + && tcph->doff != TCPH_LEN_W_TIMESTAMP) + return -1; + + /* check tcp options (only timestamp allowed) */ + if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { + u32 *topt = (u32 *)(tcph + 1); + + if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return -1; + + /* timestamp should be in right order */ + topt++; + if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), + ntohl(*topt))) + return -1; + + /* timestamp reply should not be zero */ + topt++; + if (*topt == 0) + return -1; + } + + return 0; +} + +static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) +{ + struct iphdr *iph = lro_desc->iph; + struct tcphdr *tcph = lro_desc->tcph; + u32 *p; + __wsum tcp_hdr_csum; + + tcph->ack_seq = lro_desc->tcp_ack; + tcph->window = lro_desc->tcp_window; + + if (lro_desc->tcp_saw_tstamp) { + p = (u32 *)(tcph + 1); + *(p+2) = lro_desc->tcp_rcv_tsecr; + } + + iph->tot_len = htons(lro_desc->ip_tot_len); + + iph->check = 0; + iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); + + tcph->check = 0; + tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), 0); + lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); + tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + lro_desc->ip_tot_len - + IP_HDR_LEN(iph), IPPROTO_TCP, + lro_desc->data_csum); +} + +static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) +{ + __wsum tcp_csum; + __wsum tcp_hdr_csum; + __wsum tcp_ps_hdr_csum; + + tcp_csum = ~csum_unfold(tcph->check); + tcp_hdr_csum = csum_partial((u8 *)tcph, TCP_HDR_LEN(tcph), tcp_csum); + + tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, + len + TCP_HDR_LEN(tcph), + IPPROTO_TCP, 0); + + return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), + tcp_ps_hdr_csum); +} + +static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph, + u16 vlan_tag, struct vlan_group *vgrp) +{ + int nr_frags; + u32 *ptr; + u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + nr_frags = skb_shinfo(skb)->nr_frags; + lro_desc->parent = skb; + lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); + lro_desc->iph = iph; + lro_desc->tcph = tcph; + lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; + lro_desc->tcp_ack = ntohl(tcph->ack_seq); + lro_desc->tcp_window = tcph->window; + + lro_desc->pkt_aggr_cnt = 1; + lro_desc->ip_tot_len = ntohs(iph->tot_len); + + if (tcph->doff == 8) { + ptr = (u32 *)(tcph+1); + lro_desc->tcp_saw_tstamp = 1; + lro_desc->tcp_rcv_tsval = *(ptr+1); + lro_desc->tcp_rcv_tsecr = *(ptr+2); + } + + lro_desc->mss = tcp_data_len; + lro_desc->vgrp = vgrp; + lro_desc->vlan_tag = vlan_tag; + lro_desc->active = 1; + + lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, + tcp_data_len); +} + +static inline void lro_clear_desc(struct net_lro_desc *lro_desc) +{ + memset(lro_desc, 0, sizeof(struct net_lro_desc)); +} + +static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, + struct tcphdr *tcph, int tcp_data_len) +{ + struct sk_buff *parent = lro_desc->parent; + u32 *topt; + + lro_desc->pkt_aggr_cnt++; + lro_desc->ip_tot_len += tcp_data_len; + lro_desc->tcp_next_seq += tcp_data_len; + lro_desc->tcp_window = tcph->window; + lro_desc->tcp_ack = tcph->ack_seq; + + /* don't update tcp_rcv_tsval, would not work with PAWS */ + if (lro_desc->tcp_saw_tstamp) { + topt = (u32 *) (tcph + 1); + lro_desc->tcp_rcv_tsecr = *(topt + 2); + } + + lro_desc->data_csum = csum_block_add(lro_desc->data_csum, + lro_tcp_data_csum(iph, tcph, + tcp_data_len), + parent->len); + + parent->len += tcp_data_len; + parent->data_len += tcp_data_len; + if (tcp_data_len > lro_desc->mss) + lro_desc->mss = tcp_data_len; +} + +static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *parent = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb_pull(skb, (skb->len - tcp_data_len)); + parent->truesize += skb->truesize; + + if (lro_desc->last_skb) + lro_desc->last_skb->next = skb; + else + skb_shinfo(parent)->frag_list = skb; + + lro_desc->last_skb = skb; +} + +static void lro_add_frags(struct net_lro_desc *lro_desc, + int len, int hlen, int truesize, + struct skb_frag_struct *skb_frags, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *skb = lro_desc->parent; + int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_add_common(lro_desc, iph, tcph, tcp_data_len); + + skb->truesize += truesize; + + skb_frags[0].page_offset += hlen; + skb_frags[0].size -= hlen; + + while (tcp_data_len > 0) { + *(lro_desc->next_frag) = *skb_frags; + tcp_data_len -= skb_frags->size; + lro_desc->next_frag++; + skb_frags++; + skb_shinfo(skb)->nr_frags++; + } +} + +static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, + struct iphdr *iph, + struct tcphdr *tcph) +{ + if ((lro_desc->iph->saddr != iph->saddr) + || (lro_desc->iph->daddr != iph->daddr) + || (lro_desc->tcph->source != tcph->source) + || (lro_desc->tcph->dest != tcph->dest)) + return -1; + return 0; +} + +static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_arr, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc = NULL; + struct net_lro_desc *tmp; + int max_desc = lro_mgr->max_desc; + int i; + + for (i = 0; i < max_desc; i++) { + tmp = &lro_arr[i]; + if (tmp->active) + if (!lro_check_tcp_conn(tmp, iph, tcph)) { + lro_desc = tmp; + goto out; + } + } + + for (i = 0; i < max_desc; i++) { + if (!lro_arr[i].active) { + lro_desc = &lro_arr[i]; + goto out; + } + } + + LRO_INC_STATS(lro_mgr, no_desc); +out: + return lro_desc; +} + +static void lro_flush(struct net_lro_mgr *lro_mgr, + struct net_lro_desc *lro_desc) +{ + if (lro_desc->pkt_aggr_cnt > 1) + lro_update_tcp_ip_header(lro_desc); + + skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; + + if (lro_desc->vgrp) { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + vlan_hwaccel_receive_skb(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + else + vlan_hwaccel_rx(lro_desc->parent, + lro_desc->vgrp, + lro_desc->vlan_tag); + + } else { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + netif_receive_skb(lro_desc->parent); + else + netif_rx(lro_desc->parent); + } + + LRO_INC_STATS(lro_mgr, flushed); + lro_clear_desc(lro_desc); +} + +static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, + struct vlan_group *vgrp, u16 vlan_tag, void *priv) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + u64 flags; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_skb_header + || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) + goto out; + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) + && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features)) + vlan_hdr_len = VLAN_HLEN; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) + goto out; + + skb->ip_summed = lro_mgr->ip_summed_aggr; + lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); + LRO_INC_STATS(lro_mgr, aggregated); + return 0; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) + goto out2; + + lro_add_packet(lro_desc, skb, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return 0; + +out2: /* send aggregated SKBs to stack */ + lro_flush(lro_mgr, lro_desc); + +out: /* Original SKB has to be posted to stack */ + skb->ip_summed = lro_mgr->ip_summed; + return 1; +} + + +static struct sk_buff *lro_gen_skb(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + void *mac_hdr, + int hlen, __wsum sum, + u32 ip_summed) +{ + struct sk_buff *skb; + struct skb_frag_struct *skb_frags; + int data_len = len; + int hdr_len = min(len, hlen); + + skb = netdev_alloc_skb(lro_mgr->dev, hlen); + if (!skb) + return NULL; + + skb->len = len; + skb->data_len = len - hdr_len; + skb->truesize += true_size; + skb->tail += hdr_len; + + memcpy(skb->data, mac_hdr, hdr_len); + + skb_frags = skb_shinfo(skb)->frags; + while (data_len > 0) { + *skb_frags = *frags; + data_len -= frags->size; + skb_frags++; + frags++; + skb_shinfo(skb)->nr_frags++; + } + + skb_shinfo(skb)->frags[0].page_offset += hdr_len; + skb_shinfo(skb)->frags[0].size -= hdr_len; + + skb->ip_summed = ip_summed; + skb->csum = sum; + skb->protocol = eth_type_trans(skb, lro_mgr->dev); + return skb; +} + +static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + struct sk_buff *skb; + u64 flags; + void *mac_hdr; + int mac_hdr_len; + int hdr_len = LRO_MAX_PG_HLEN; + int vlan_hdr_len = 0; + + if (!lro_mgr->get_frag_header + || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { + mac_hdr = page_address(frags->page) + frags->page_offset; + goto out1; + } + + if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) + goto out1; + + hdr_len = (int)((void *)(tcph) + TCP_HDR_LEN(tcph) - mac_hdr); + mac_hdr_len = (int)((void *)(iph) - mac_hdr); + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out1; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, NULL)) + goto out1; + + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, 0, lro_mgr->ip_summed_aggr); + if (!skb) + goto out; + + if ((skb->protocol == htons(ETH_P_8021Q)) + && !test_bit(LRO_F_EXTRACT_VLAN_ID, &lro_mgr->features)) + vlan_hdr_len = VLAN_HLEN; + + iph = (void *)(skb->data + vlan_hdr_len); + tcph = (void *)((u8 *)skb->data + vlan_hdr_len + + IP_HDR_LEN(iph)); + + lro_init_desc(lro_desc, skb, iph, tcph, 0, NULL); + LRO_INC_STATS(lro_mgr, aggregated); + return 0; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(iph, tcph, len - mac_hdr_len, lro_desc)) + goto out2; + + lro_add_frags(lro_desc, len, hdr_len, true_size, frags, iph, tcph); + LRO_INC_STATS(lro_mgr, aggregated); + + if ((skb_shinfo(lro_desc->parent)->nr_frags >= lro_mgr->max_aggr) || + lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) + lro_flush(lro_mgr, lro_desc); + + return NULL; + +out2: /* send aggregated packets to the stack */ + lro_flush(lro_mgr, lro_desc); + +out1: /* Original packet has to be posted to the stack */ + skb = lro_gen_skb(lro_mgr, frags, len, true_size, mac_hdr, + hdr_len, sum, lro_mgr->ip_summed); +out: + return skb; +} + +void lro_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + netif_receive_skb(skb); + else + netif_rx(skb); + } +} +EXPORT_SYMBOL(lro_receive_skb); + +void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + struct vlan_group *vgrp, + u16 vlan_tag, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) { + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); + } +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); + +void lro_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, NULL, 0, + priv, sum); + if (!skb) + return; + + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + netif_receive_skb(skb); + else + netif_rx(skb); +} +EXPORT_SYMBOL(lro_receive_frags); + +void lro_vlan_hwaccel_receive_frags(struct net_lro_mgr *lro_mgr, + struct skb_frag_struct *frags, + int len, int true_size, + struct vlan_group *vgrp, + u16 vlan_tag, void *priv, __wsum sum) +{ + struct sk_buff *skb; + + skb = __lro_proc_segment(lro_mgr, frags, len, true_size, vgrp, + vlan_tag, priv, sum); + if (!skb) + return; + + if (test_bit(LRO_F_NAPI, &lro_mgr->features)) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); + else + vlan_hwaccel_rx(skb, vgrp, vlan_tag); +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_frags); + +void lro_flush_all(struct net_lro_mgr *lro_mgr) +{ + int i; + struct net_lro_desc *lro_desc = lro_mgr->lro_arr; + + for (i = 0; i < lro_mgr->max_desc; i++) { + if (lro_desc[i].active) + lro_flush(lro_mgr, &lro_desc[i]); + } +} +EXPORT_SYMBOL(lro_flush_all); + +void lro_flush_pkt(struct net_lro_mgr *lro_mgr, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (lro_desc->active) + lro_flush(lro_mgr, lro_desc); +} +EXPORT_SYMBOL(lro_flush_pkt); -- cgit v1.2.3 From d738cd8fca948e45d53120247cb7a5f5be3ca09e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sat, 24 Mar 2007 21:03:23 -0700 Subject: [TCP]: Add highest_sack seqno, points to globally highest SACK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is guaranteed to be valid only when !tp->sacked_out. In most cases this seqno is available in the last ACK but there is no guarantee for that. The new fast recovery loss marking algorithm needs this as entry point. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f893e90061eb..813f2049b85d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -979,8 +979,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int i; int first_sack_index; - if (!tp->sacked_out) + if (!tp->sacked_out) { tp->fackets_out = 0; + tp->highest_sack = tp->snd_una; + } prior_fackets = tp->fackets_out; /* Check for D-SACK. */ @@ -1217,6 +1219,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; + + if (after(TCP_SKB_CB(skb)->seq, + tp->highest_sack)) + tp->highest_sack = TCP_SKB_CB(skb)->seq; } else { if (dup_sack && (sacked&TCPCB_RETRANS)) reord = min(fack_count, reord); -- cgit v1.2.3 From d8f4f2235abc7b30cf447ca3e22ac28326b12f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 20 Apr 2007 22:56:38 -0700 Subject: [TCP]: Extracted rexmit hint clearing from the LOST marking code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 813f2049b85d..7d843c429381 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1765,6 +1765,20 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->left_out = tp->lost_out; } +/* RFC: This is from the original, I doubt that this is necessary at all: + * clear xmit_retrans hint if seq of this skb is beyond hint. How could we + * retransmitted past LOST markings in the first place? I'm not fully sure + * about undo and end of connection cases, which can cause R without L? + */ +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, + struct sk_buff *skb) +{ + if ((tp->retransmit_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + tp->retransmit_skb_hint = skb; +} + /* Mark head of queue up as lost. */ static void tcp_mark_head_lost(struct sock *sk, int packets, u32 high_seq) @@ -1795,14 +1809,7 @@ static void tcp_mark_head_lost(struct sock *sk, if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - - /* clear xmit_retransmit_queue hints - * if this is beyond hint */ - if (tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = NULL; - + tcp_verify_retransmit_hint(tp, skb); } } tcp_sync_left_out(tp); @@ -1843,13 +1850,7 @@ static void tcp_update_scoreboard(struct sock *sk) if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - - /* clear xmit_retrans hint */ - if (tp->retransmit_skb_hint && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - - tp->retransmit_skb_hint = NULL; + tcp_verify_retransmit_hint(tp, skb); } } -- cgit v1.2.3 From 19b2b486580f5939688d3e225acdc0f4b291ed0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 28 Mar 2007 12:06:37 -0700 Subject: [TCP]: Rexmit hint must be cleared instead of setting it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stupid error from my side. Even though now that I noticed this, I hoped it would have been an optimization but no, the counter hint is then incorrect. Thus clearing is necessary for now (I still suspect though that this path is never executed). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7d843c429381..0aa17243d369 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1776,7 +1776,7 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, if ((tp->retransmit_skb_hint != NULL) && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - tp->retransmit_skb_hint = skb; + tp->retransmit_skb_hint = NULL; } /* Mark head of queue up as lost. */ -- cgit v1.2.3 From d06e021d71d95aae402340dc3d9f79f9c8ad11d7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 18 Jun 2007 22:43:06 -0700 Subject: [TCP]: Extract DSACK detection code from tcp_sacktag_write_queue(). Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 56 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 20 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0aa17243d369..5187870d0333 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -960,6 +960,39 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. */ +static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, + struct tcp_sack_block_wire *sp, int num_sacks, + u32 prior_snd_una) +{ + u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq)); + u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq)); + int dup_sack = 0; + + if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { + dup_sack = 1; + tp->rx_opt.sack_ok |= 4; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + } else if (num_sacks > 1) { + u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq)); + u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq)); + + if (!after(end_seq_0, end_seq_1) && + !before(start_seq_0, start_seq_1)) { + dup_sack = 1; + tp->rx_opt.sack_ok |= 4; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + } + } + + /* D-SACK for already forgotten data... Do dumb counting. */ + if (dup_sack && + !after(end_seq_0, prior_snd_una) && + after(end_seq_0, tp->undo_marker)) + tp->undo_retrans--; + + return dup_sack; +} + static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { @@ -985,27 +1018,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } prior_fackets = tp->fackets_out; - /* Check for D-SACK. */ - if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)->ack_seq)) { - flag |= FLAG_DSACKING_ACK; - found_dup_sack = 1; - tp->rx_opt.sack_ok |= 4; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); - } else if (num_sacks > 1 && - !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) && - !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + num_sacks, prior_snd_una); + if (found_dup_sack) flag |= FLAG_DSACKING_ACK; - found_dup_sack = 1; - tp->rx_opt.sack_ok |= 4; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); - } - - /* D-SACK for already forgotten data... - * Do dumb counting. */ - if (found_dup_sack && - !after(ntohl(sp[0].end_seq), prior_snd_una) && - after(ntohl(sp[0].end_seq), tp->undo_marker)) - tp->undo_retrans--; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can -- cgit v1.2.3 From 4ddf66769d2df868071420e2e0106746c6204ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sun, 27 May 2007 01:52:00 -0700 Subject: [TCP]: Move Reno SACKed_out counter functions earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 98 +++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 51 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5187870d0333..2711ef7df7b5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1314,6 +1314,53 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ +static void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if ((tp->sacked_out + holes) > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + tcp_update_reordering(sk, tp->packets_out + addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tp->sacked_out++; + tcp_check_reno_reordering(sk, 0); + tcp_sync_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(sk, acked); + tcp_sync_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out = 0; + tp->left_out = tp->lost_out; +} + int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -1730,57 +1777,6 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } -/* If we receive more dupacks than we expected counting segments - * in assumption of absent reordering, interpret this as reordering. - * The only another reason could be bug in receiver TCP. - */ -static void tcp_check_reno_reordering(struct sock *sk, const int addend) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 holes; - - holes = max(tp->lost_out, 1U); - holes = min(holes, tp->packets_out); - - if ((tp->sacked_out + holes) > tp->packets_out) { - tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(sk, tp->packets_out + addend, 0); - } -} - -/* Emulate SACKs for SACKless connection: account for a new dupack. */ - -static void tcp_add_reno_sack(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - tcp_sync_left_out(tp); -} - -/* Account for ACK, ACKing some data in Reno Recovery phase. */ - -static void tcp_remove_reno_sacks(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (acked > 0) { - /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 >= tp->sacked_out) - tp->sacked_out = 0; - else - tp->sacked_out -= acked-1; - } - tcp_check_reno_reordering(sk, acked); - tcp_sync_left_out(tp); -} - -static inline void tcp_reset_reno_sack(struct tcp_sock *tp) -{ - tp->sacked_out = 0; - tp->left_out = tp->lost_out; -} - /* RFC: This is from the original, I doubt that this is necessary at all: * clear xmit_retrans hint if seq of this skb is beyond hint. How could we * retransmitted past LOST markings in the first place? I'm not fully sure -- cgit v1.2.3 From 9bff40fda015c4d0b57b444626cdcbf66066dbe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sun, 27 May 2007 01:53:49 -0700 Subject: [TCP] FRTO: remove unnecessary fackets/sacked_out recounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F-RTO does not touch SACKED_ACKED bits at all, so there is no need to recount them in tcp_enter_frto_loss. After removal of the else branch, nested ifs can be combined. This must also reset sacked_out when SACK is not in use as TCP could have received some duplicate ACKs prior RTO. To achieve that in a sane manner, tcp_reset_reno_sack was re-placed by the previous patch. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2711ef7df7b5..29999ef73b43 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1474,17 +1474,15 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; - tp->sacked_out = 0; tp->lost_out = 0; - tp->fackets_out = 0; tp->retrans_out = 0; + if (IsReno(tp)) + tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - cnt += tcp_skb_pcount(skb); /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... @@ -1498,19 +1496,12 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } else { TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { - /* Do not mark those segments lost that were - * forward transmitted after RTO - */ - if (!after(TCP_SKB_CB(skb)->end_seq, - tp->frto_highmark)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - } - } else { - tp->sacked_out += tcp_skb_pcount(skb); - tp->fackets_out = cnt; + /* Don't lost mark skbs that were fwd transmitted after RTO */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) && + !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); } } tcp_sync_left_out(tp); -- cgit v1.2.3 From 539d243fdd7900fa5a544c7c154dc3ddf627e840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sun, 27 May 2007 02:03:20 -0700 Subject: [TCP]: Access to highest_sack obsoletes forward_cnt_hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition, added a reference about the purpose of the loop. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 666d8a58d14a..b11025e8a49e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1945,33 +1945,28 @@ void tcp_xmit_retransmit_queue(struct sock *sk) * and retransmission... Both ways have their merits... * * For now we do not retransmit anything, while we have some new - * segments to send. + * segments to send. In the other cases, follow rule 3 for + * NextSeg() specified in RFC3517. */ if (tcp_may_send_now(sk)) return; - if (tp->forward_skb_hint) { + /* If nothing is SACKed, highest_sack in the loop won't be valid */ + if (!tp->sacked_out) + return; + + if (tp->forward_skb_hint) skb = tp->forward_skb_hint; - packet_cnt = tp->forward_cnt_hint; - } else{ + else skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } tcp_for_write_queue_from(skb, sk) { if (skb == tcp_send_head(sk)) break; - tp->forward_cnt_hint = packet_cnt; tp->forward_skb_hint = skb; - /* Similar to the retransmit loop above we - * can pretend that the retransmitted SKB - * we send out here will be composed of one - * real MSS sized packet because tcp_retransmit_skb() - * will fragment it if necessary. - */ - if (++packet_cnt > tp->fackets_out) + if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) break; if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) -- cgit v1.2.3 From bdf1ee5d3bd38d0c44bd7baa74e07adcbe4ceab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sun, 27 May 2007 02:04:16 -0700 Subject: [TCP]: Move code from tcp_ecn.h to tcp*.c and tcp.h & remove it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No other users exist for tcp_ecn.h. Very few things remain in tcp.h, for most TCP ECN functions callers reside within a single .c file and can be placed there. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 6 ++++++ net/ipv4/tcp_output.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 29999ef73b43..ea690afa592a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -118,6 +118,7 @@ int sysctl_tcp_abc __read_mostly; #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) +#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) /* Adapt the MSS value used to make delayed ack decision to the * real world. @@ -198,6 +199,55 @@ static inline int tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } +static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +{ + if (tp->ecn_flags&TCP_ECN_OK) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tp->ecn_flags&TCP_ECN_OK) { + if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + /* Funny extension: if ECT is not set on a segment, + * it is surely retransmit. It is not in ECN RFC, + * but Linux follows this rule. */ + else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) + tcp_enter_quickack_mode((struct sock *)tp); + } +} + +static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) +{ + if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + return 1; + return 0; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a12b08fca5ad..36a8fbd0e64e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -368,6 +368,12 @@ void tcp_twsk_destructor(struct sock *sk) EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +static inline void TCP_ECN_openreq_child(struct tcp_sock *tp, + struct request_sock *req) +{ + tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b11025e8a49e..3abe22e4b576 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -269,6 +269,56 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } +static inline void TCP_ECN_send_synack(struct tcp_sock *tp, + struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; + if (!(tp->ecn_flags&TCP_ECN_OK)) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; +} + +static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->ecn_flags = 0; + if (sysctl_tcp_ecn) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; + tp->ecn_flags = TCP_ECN_OK; + } +} + +static __inline__ void +TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th) +{ + if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, + int tcp_header_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->ecn_flags & TCP_ECN_OK) { + /* Not-retransmitted data segment: set ECT and inject CWR. */ + if (skb->len != tcp_header_len && + !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { + INET_ECN_xmit(sk); + if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; + tcp_hdr(skb)->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } + } else { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } + if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) + tcp_hdr(skb)->ece = 1; + } +} + static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, __u32 tstamp, __u8 **md5_hash) { -- cgit v1.2.3 From af610b4ca19f513a50d47ea93ed57241383c8081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 15 Jun 2007 12:58:38 +0300 Subject: [TCP]: Add tcp_dec_pcount_approx int variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3abe22e4b576..3c8c8e7f6f6d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -740,22 +740,16 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (diff > 0) { /* Adjust Reno SACK estimate. */ if (!tp->rx_opt.sack_ok) { - tp->sacked_out -= diff; - if ((int)tp->sacked_out < 0) - tp->sacked_out = 0; + tcp_dec_pcount_approx_int(&tp->sacked_out, diff); tcp_sync_left_out(tp); } - tp->fackets_out -= diff; - if ((int)tp->fackets_out < 0) - tp->fackets_out = 0; + tcp_dec_pcount_approx_int(&tp->fackets_out, diff); /* SACK fastpath might overwrite it unless dealt with */ if (tp->fastpath_skb_hint != NULL && after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) { - tp->fastpath_cnt_hint -= diff; - if ((int)tp->fastpath_cnt_hint < 0) - tp->fastpath_cnt_hint = 0; + tcp_dec_pcount_approx_int(&tp->fastpath_cnt_hint, diff); } } } -- cgit v1.2.3 From 35e8694198ba94b62df8aa35fa6e52a1cfb86df2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 31 May 2007 10:16:47 +0300 Subject: [TCP]: Remove num_acked>0 checks from cong.ctrl mods pkts_acked MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need for such check in pkts_acked because the callback is not invoked unless at least one segment got fully ACKed (i.e., the snd_una moved past skb's end_seq) by the cumulative ACK's snd_una advancement. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cubic.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 4586211e3757..5dba0fc8f579 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -210,7 +210,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 485d7ea35f75..80bd084a9f91 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -314,7 +314,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) struct bictcp *ca = inet_csk_ca(sk); u32 delay; - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (icsk->icsk_ca_state == TCP_CA_Open) { cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } -- cgit v1.2.3 From b5860bbac7be1381626f3dc8a0cb970a60fcefb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Aug 2007 14:33:18 +0300 Subject: [TCP]: Tighten tcp_sock's belt, drop left_out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is easily calculable when needed and user are not that many after all. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 ++--------- net/ipv4/tcp_minisocks.c | 1 - net/ipv4/tcp_output.c | 12 +++--------- 3 files changed, 5 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ea690afa592a..957e0fb8afb7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1346,8 +1346,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); @@ -1408,7 +1406,6 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { tp->sacked_out = 0; - tp->left_out = tp->lost_out; } int tcp_use_frto(struct sock *sk) @@ -1573,7 +1570,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) void tcp_clear_retrans(struct tcp_sock *tp) { - tp->left_out = 0; tp->retrans_out = 0; tp->fackets_out = 0; @@ -1973,7 +1969,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), - tp->snd_cwnd, tp->left_out, + tp->snd_cwnd, tp->sacked_out + tp->lost_out, tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -2102,7 +2098,6 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; @@ -2126,8 +2121,6 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tcp_sync_left_out(tp); - if (tp->retrans_out == 0) tp->retrans_stamp = 0; @@ -2137,7 +2130,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp->left_out || tp->retrans_out || tp->undo_marker) + if (tp->sacked_out || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 36a8fbd0e64e..fdfe89fe646b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -405,7 +405,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; - newtp->left_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; newtp->fackets_out = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3c8c8e7f6f6d..7434944caa8f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -732,10 +732,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= diff; - if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= diff; - tp->left_out -= diff; - } if (diff > 0) { /* Adjust Reno SACK estimate. */ @@ -1727,15 +1725,11 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) tp->retrans_out -= tcp_skb_pcount(next_skb); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) tp->lost_out -= tcp_skb_pcount(next_skb); - tp->left_out -= tcp_skb_pcount(next_skb); - } /* Reno case is special. Sigh... */ - if (!tp->rx_opt.sack_ok && tp->sacked_out) { + if (!tp->rx_opt.sack_ok && tp->sacked_out) tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - tp->left_out -= tcp_skb_pcount(next_skb); - } /* Not quite right: it can be > snd.fack, but * it is better to underestimate fackets. -- cgit v1.2.3 From 83ae40885f33e406c87c86b0bd4b6fd31a741f12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Aug 2007 14:37:30 +0300 Subject: [TCP]: Add tcp_left_out(tp) "back" to get cleaner looking lines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tp->left_out got removed but nothing came to replace it back then (users just did addition by themselves), so add function for users now. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 957e0fb8afb7..6bdd053e252f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1969,7 +1969,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), - tp->snd_cwnd, tp->sacked_out + tp->lost_out, + tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } -- cgit v1.2.3 From 005903bc3a0e8473fef809e8775db52dcd3cde63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Aug 2007 14:44:16 +0300 Subject: [TCP]: Left out sync->verify (the new meaning of it) & definify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Left_out was dropped a while ago, thus leaving verifying consistency of the "left out" as only task for the function in question. Thus make it's name more appropriate. In addition, it is intentionally converted to #define instead of static inline because the location of the invariant failure is the most important thing to have if this ever triggers. I think it would have been helpful e.g. in this case where the location of the failure point had to be based on some quesswork: http://lkml.org/lkml/2007/5/2/464 ...Luckily the guesswork seems to have proved to be correct. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 20 ++++++++++---------- net/ipv4/tcp_output.c | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6bdd053e252f..b11bd1624227 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1383,7 +1383,7 @@ static void tcp_add_reno_sack(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->sacked_out++; tcp_check_reno_reordering(sk, 0); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } /* Account for ACK, ACKing some data in Reno Recovery phase. */ @@ -1400,7 +1400,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) tp->sacked_out -= acked-1; } tcp_check_reno_reordering(sk, acked); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } static inline void tcp_reset_reno_sack(struct tcp_sock *tp) @@ -1496,7 +1496,7 @@ void tcp_enter_frto(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. @@ -1551,7 +1551,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->lost_out += tcp_skb_pcount(skb); } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; @@ -1626,7 +1626,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->fackets_out = cnt; } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); @@ -1861,7 +1861,7 @@ static void tcp_mark_head_lost(struct sock *sk, tcp_verify_retransmit_hint(tp, skb); } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } /* Account newly detected lost packet(s) */ @@ -1905,7 +1905,7 @@ static void tcp_update_scoreboard(struct sock *sk) tp->scoreboard_skb_hint = skb; - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } } @@ -2217,8 +2217,8 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } - /* D. Synchronize left_out to current state. */ - tcp_sync_left_out(tp); + /* D. Check consistency of the current state. */ + tcp_verify_left_out(tp); /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ @@ -2765,7 +2765,7 @@ static int tcp_process_frto(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Duplicate the behavior from Loss state (fastretrans_alert) */ if (flag&FLAG_DATA_ACKED) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7434944caa8f..a92fad55cd32 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -739,7 +739,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss /* Adjust Reno SACK estimate. */ if (!tp->rx_opt.sack_ok) { tcp_dec_pcount_approx_int(&tp->sacked_out, diff); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } tcp_dec_pcount_approx_int(&tp->fackets_out, diff); @@ -1774,7 +1774,7 @@ void tcp_simple_retransmit(struct sock *sk) if (!lost) return; - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Don't muck with the congestion window here. * Reason is that we do not increase amount of _data_ -- cgit v1.2.3 From 86426c22d24e0c904012711a14cb5021f4a167dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Aug 2007 14:45:17 +0300 Subject: [TCP]: Restore over-zealous tcp_sync_left_out-like removals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tcp_verify_left_out is useful for verifying S+L condition, so add it back to couple of places in where the code was not calling to tcp_sync_left_out but used own ad-hoc solution (before the tcp_sync_left_out got removed). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b11bd1624227..93823b83522b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1346,6 +1346,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } + tcp_verify_left_out(tp); + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); @@ -2121,6 +2123,8 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); + tcp_verify_left_out(tp); + if (tp->retrans_out == 0) tp->retrans_stamp = 0; -- cgit v1.2.3 From d02596e32925edaeccee0af8eb6c229b5615de42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sat, 7 Jul 2007 13:39:02 +0300 Subject: [TCP]: Keep state in Disorder also if only lost_out > 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This happens rather infrequently and is only possible during FRTO. We must not allow TCP to slip to Open state because tcp_fastretrans_alert might then not be called on it's time when FRTO has exited. This become a problem when left_out got removed and was replaced by just sacked_out. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 93823b83522b..bf4fc3516fb9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2134,7 +2134,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp->sacked_out || tp->retrans_out || tp->undo_marker) + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { -- cgit v1.2.3 From 1b6d427bb7eb69e6dc4f194a5b0f4a382a16ff82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Aug 2007 14:53:36 +0300 Subject: [TCP]: Reduce sacked_out with reno when purging write_queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously TCP had a transitional state during which reno counted segments that are already below the current window into sacked_out, which is now prevented. In addition, re-try now the unconditional S+L skb catching. This approach conservatively calls just remove_sack and leaves reset_sack() calls alone. The best solution to the whole problem would be to first calculate the new sacked_out fully (this patch does not move reno_sack_reset calls from original sites and thus does not implement this). However, that would require very invasive change to fastretrans_alert (perhaps even slicing it to two halves). Alternatively, all callers of tcp_packets_in_flight (i.e., users that depend on sacked_out) should be postponed until the new sacked_out has been calculated but it isn't any simpler alternative. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf4fc3516fb9..f8af018dd224 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2187,7 +2187,7 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * tcp_xmit_retransmit_queue(). */ static void -tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) +tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2273,12 +2273,8 @@ tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (IsReno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else { - int acked = prior_packets - tp->packets_out; - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, acked); - do_lost = tcp_try_undo_partial(sk, acked); - } + } else + do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) @@ -2577,6 +2573,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); + if (IsReno(tp)) + tcp_remove_reno_sacks(sk, pkts_acked); + if (ca_ops->pkts_acked) { s32 rtt_us = -1; @@ -2927,7 +2926,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight, 0); - tcp_fastretrans_alert(sk, prior_packets, flag); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight, 1); -- cgit v1.2.3 From e60402d0a909ca2e6e2fbdf9ed004ef0fae36d33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 9 Aug 2007 15:14:46 +0300 Subject: [TCP]: Move sack_ok access to obviously named funcs & cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously code had IsReno/IsFack defined as macros that were local to tcp_input.c though sack_ok field has user elsewhere too for the same purpose. This changes them to static inlines as preferred according the current coding style and unifies the access to sack_ok across multiple files. Magic bitops of sack_ok for FACK and DSACK are also abstracted to functions with appropriate names. Note: - One sack_ok = 1 remains but that's self explanary, i.e., it enables sack - Couple of !IsReno cases are changed to tcp_is_sack - There were no users for IsDSack => I dropped it Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 82 +++++++++++++++++++++++++++--------------------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 6 ++-- net/ipv4/tcp_timer.c | 2 +- 5 files changed, 52 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7e740112b238..aff31427f525 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2014,7 +2014,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f8af018dd224..faba9beb3613 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -111,10 +111,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) -#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0) -#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) -#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) - #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -860,6 +856,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) } } +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok &= ~2; +} + +/* Take a notice that peer is sending DSACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok |= 4; +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -881,7 +892,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } @@ -943,9 +954,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); - else if (IsReno(tp)) + else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); - else if (IsFack(tp)) + else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); @@ -957,8 +968,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - /* Disable FACK yet. */ - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); } } @@ -1020,7 +1030,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; - tp->rx_opt.sack_ok |= 4; + tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq)); @@ -1029,7 +1039,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (!after(end_seq_0, end_seq_1) && !before(start_seq_0, start_seq_1)) { dup_sack = 1; - tp->rx_opt.sack_ok |= 4; + tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } } @@ -1326,7 +1336,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ continue; if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && - (IsFack(tp) || + (tcp_is_fack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * tp->mss_cache))) { @@ -1526,7 +1536,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->lost_out = 0; tp->retrans_out = 0; - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { @@ -1668,7 +1678,7 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? tp->sacked_out+1 : tp->fackets_out; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1872,7 +1882,7 @@ static void tcp_update_scoreboard(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (IsFack(tp)) { + if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; @@ -1886,7 +1896,7 @@ static void tcp_update_scoreboard(struct sock *sk) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk)) { + if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1938,7 +1948,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) int decr = tp->snd_cwnd_cnt + 1; if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || - (IsReno(tp) && !(flag&FLAG_NOT_DUP))) { + (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) { tp->snd_cwnd_cnt = decr&1; decr >>= 1; @@ -2029,7 +2039,7 @@ static int tcp_try_undo_recovery(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); tp->undo_marker = 0; } - if (tp->snd_una == tp->high_seq && IsReno(tp)) { + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ @@ -2059,7 +2069,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) { struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -2104,7 +2114,7 @@ static int tcp_try_undo_loss(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; - if (!IsReno(tp)) + if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return 1; } @@ -2251,14 +2261,14 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. */ - IsReno(tp) || tp->snd_una != tp->high_seq) { + tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk)) return; @@ -2271,7 +2281,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { - if (IsReno(tp) && is_dupack) + if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else do_lost = tcp_try_undo_partial(sk, pkts_acked); @@ -2288,7 +2298,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) return; /* Loss is undone; fall through to processing in Open state. */ default: - if (IsReno(tp)) { + if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) @@ -2316,7 +2326,7 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) /* Otherwise enter Recovery state */ - if (IsReno(tp)) + if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); @@ -2573,7 +2583,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); if (ca_ops->pkts_acked) { @@ -2599,7 +2609,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); - if (!tp->packets_out && tp->rx_opt.sack_ok) { + if (!tp->packets_out && tcp_is_sack(tp)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", @@ -2779,7 +2789,7 @@ static int tcp_process_frto(struct sock *sk, int flag) return 1; } - if (!IsSackFrto() || IsReno(tp)) { + if (!IsSackFrto() || tcp_is_reno(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate @@ -3263,7 +3273,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) * Probably, we should reset in this case. For now drop them. */ __skb_queue_purge(&tp->out_of_order_queue); - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); @@ -3293,7 +3303,7 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_se static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); else @@ -3323,7 +3333,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -3639,7 +3649,7 @@ drop: if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ - if (tp->rx_opt.sack_ok) { + if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = 1; @@ -3704,7 +3714,7 @@ drop: } add_sack: - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); } } @@ -3893,7 +3903,7 @@ static int tcp_prune_queue(struct sock *sk) * is in a sad state like this, we care only about integrity * of the connection not performance. */ - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); } @@ -4594,8 +4604,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tp->rx_opt.sack_ok && sysctl_tcp_fack) - tp->rx_opt.sack_ok |= 2; + if (tcp_is_sack(tp) && sysctl_tcp_fack) + tcp_enable_fack(tp); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fdfe89fe646b..b61b76847ad9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -445,7 +445,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { if (sysctl_tcp_fack) - newtp->rx_opt.sack_ok |= 2; + tcp_enable_fack(newtp); } newtp->window_clamp = req->window_clamp; newtp->rcv_ssthresh = req->rcv_wnd; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a92fad55cd32..a3679174e78a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -737,7 +737,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (diff > 0) { /* Adjust Reno SACK estimate. */ - if (!tp->rx_opt.sack_ok) { + if (tcp_is_reno(tp)) { tcp_dec_pcount_approx_int(&tp->sacked_out, diff); tcp_verify_left_out(tp); } @@ -1728,7 +1728,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) tp->lost_out -= tcp_skb_pcount(next_skb); /* Reno case is special. Sigh... */ - if (!tp->rx_opt.sack_ok && tp->sacked_out) + if (tcp_is_reno(tp) && tp->sacked_out) tcp_dec_pcount_approx(&tp->sacked_out, next_skb); /* Not quite right: it can be > snd.fack, but @@ -1976,7 +1976,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) return; /* No forward retransmissions in Reno are possible. */ - if (!tp->rx_opt.sack_ok) + if (tcp_is_reno(tp)) return; /* Yeah, we have to make difficult choice between forward transmission diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e9b151b3a598..d8970ecfcfc8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -315,7 +315,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (icsk->icsk_retransmits == 0) { if (icsk->icsk_ca_state == TCP_CA_Disorder || icsk->icsk_ca_state == TCP_CA_Recovery) { - if (tp->rx_opt.sack_ok) { + if (tcp_is_sack(tp)) { if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); else -- cgit v1.2.3 From 0680191642c27c81c9be4557d9c6aa3487c15f69 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 10 Aug 2007 15:22:13 -0700 Subject: [IPV4] fib_trie: cleanup Try this out: * replace macro's with inlines * get rid of places doing multiple evaluations of NODE_PARENT [akpm@linux-foundation.org: rcu_dereference wants an lval] Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 68 ++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9ca786a6fd3c..f28f9f5f240c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -93,15 +93,8 @@ typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL -#define NODE_PARENT(node) \ - ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) - #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) -#define NODE_SET_PARENT(node, ptr) \ - rcu_assign_pointer((node)->parent, \ - ((unsigned long)(ptr)) | NODE_TYPE(node)) - #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) @@ -174,6 +167,19 @@ static void tnode_free(struct tnode *tn); static struct kmem_cache *fn_alias_kmem __read_mostly; static struct trie *trie_local = NULL, *trie_main = NULL; +static inline struct tnode *node_parent(struct node *node) +{ + struct tnode *ret; + + ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK); + return rcu_dereference(ret); +} + +static inline void node_set_parent(struct node *node, struct tnode *ptr) +{ + rcu_assign_pointer(node->parent, + (unsigned long)ptr | NODE_TYPE(node)); +} /* rcu_read_lock needs to be hold by caller from readside */ @@ -446,7 +452,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w tn->full_children++; if (n) - NODE_SET_PARENT(n, tn); + node_set_parent(n, tn); rcu_assign_pointer(tn->child[i], n); } @@ -481,7 +487,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) continue; /* compress one level */ - NODE_SET_PARENT(n, NULL); + node_set_parent(n, NULL); tnode_free(tn); return n; } @@ -636,7 +642,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* compress one level */ - NODE_SET_PARENT(n, NULL); + node_set_parent(n, NULL); tnode_free(tn); return n; } @@ -961,24 +967,21 @@ fib_find_node(struct trie *t, u32 key) static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { int wasfull; - t_key cindex, key; - struct tnode *tp = NULL; - - key = tn->key; + t_key cindex, key = tn->key; + struct tnode *tp; - while (tn != NULL && NODE_PARENT(tn) != NULL) { - - tp = NODE_PARENT(tn); + while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize (t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); - if (!NODE_PARENT(tn)) + tp = node_parent((struct node *) tn); + if (!tp) break; - - tn = NODE_PARENT(tn); + tn = tp; } + /* Handle last (top) tnode */ if (IS_TNODE(tn)) tn = (struct tnode*) resize(t, (struct tnode *)tn); @@ -1031,7 +1034,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - BUG_ON(n && NODE_PARENT(n) != tn); + BUG_ON(n && node_parent(n) != tn); } else break; } @@ -1083,7 +1086,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - NODE_SET_PARENT(l, tp); + node_set_parent((struct node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)l); @@ -1114,7 +1117,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) goto err; } - NODE_SET_PARENT(tn, tp); + node_set_parent((struct node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); put_child(t, tn, missbit, (struct node *)l); @@ -1495,12 +1498,13 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - if (NODE_PARENT(pn) == NULL) + struct tnode *parent = node_parent((struct node *) pn); + if (!parent) goto failed; /* Get Child's index */ - cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); - pn = NODE_PARENT(pn); + cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits); + pn = parent; chopped_off = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -1536,7 +1540,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - BUG_ON(n && NODE_PARENT(n) != tn); + BUG_ON(n && node_parent(n) != tn); } l = (struct leaf *) n; @@ -1551,7 +1555,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) t->revision++; t->size--; - tp = NODE_PARENT(n); + tp = node_parent(n); tnode_free((struct tnode *) n); if (tp) { @@ -1703,7 +1707,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) p = (struct tnode*) trie; /* Start */ } else - p = (struct tnode *) NODE_PARENT(c); + p = node_parent(c); while (p) { int pos, last; @@ -1740,7 +1744,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) up: /* No more children go up one step */ c = (struct node *) p; - p = (struct tnode *) NODE_PARENT(p); + p = node_parent(c); } return NULL; /* Ready. Root of trie */ } @@ -2043,7 +2047,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = NODE_PARENT(tn); + p = node_parent((struct node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2317,7 +2321,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) return 0; - if (!NODE_PARENT(n)) { + if (!node_parent(n)) { if (iter->trie == trie_local) seq_puts(seq, ":\n"); else -- cgit v1.2.3 From ab66b4a7a3969077f6e2a18a0d2d849d3b84a337 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 10 Aug 2007 15:22:58 -0700 Subject: [IPV4] fib_trie: macro cleanup This patch converts the messy macro for MASK_PFX to inline function and expands TKEY_GET_MASK in the one place it is used. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f28f9f5f240c..52b2891c63b7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -85,8 +85,6 @@ #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) -#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) -#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) typedef unsigned int t_key; @@ -195,6 +193,11 @@ static inline int tnode_child_length(const struct tnode *tn) return 1 << tn->bits; } +static inline t_key mask_pfx(t_key k, unsigned short l) +{ + return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); +} + static inline t_key tkey_extract_bits(t_key a, int offset, int bits) { if (offset < KEYLENGTH) @@ -679,7 +682,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode->pos == oldtnode->pos + oldtnode->bits && inode->bits > 1) { struct tnode *left, *right; - t_key m = TKEY_GET_MASK(inode->pos, 1); + t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; left = tnode_new(inode->key&(~m), inode->pos + 1, inode->bits - 1); @@ -1367,7 +1370,8 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result bits = pn->bits; if (!chopped_off) - cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); + cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), + pos, bits); n = tnode_get_child(pn, cindex); @@ -1453,8 +1457,8 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result * to find a matching prefix. */ - node_prefix = MASK_PFX(cn->key, cn->pos); - key_prefix = MASK_PFX(key, cn->pos); + node_prefix = mask_pfx(cn->key, cn->pos); + key_prefix = mask_pfx(key, cn->pos); pref_mismatch = key_prefix^node_prefix; mp = 0; @@ -2330,7 +2334,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; - __be32 prf = htonl(MASK_PFX(tn->key, tn->pos)); + __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n", -- cgit v1.2.3 From e9144bd8da80f3136b23c615609798e371e885ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Aug 2007 22:43:14 -0700 Subject: [TCP]: Remove unnecessary wrapper tcp_packets_out_dec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes caller side more obvious, there's no need to have a wrapper for this oneliner! Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index faba9beb3613..593960d66ed9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2569,7 +2569,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) last_ackt = skb->tstamp; } tcp_dec_pcount_approx(&tp->fackets_out, skb); - tcp_packets_out_dec(tp, skb); + tp->packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3679174e78a..1d65ce1b68d7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1735,7 +1735,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m * it is better to underestimate fackets. */ tcp_dec_pcount_approx(&tp->fackets_out, next_skb); - tcp_packets_out_dec(tp, next_skb); + tp->packets_out -= tcp_skb_pcount(next_skb); sk_stream_free_skb(sk, next_skb); } } -- cgit v1.2.3 From 6ff03ac355cc6c10f7b1f44dd466d41213acebca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Aug 2007 22:44:06 -0700 Subject: [TCP]: tcp_packets_out_inc to tcp_output.c (no callers elsewhere) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d65ce1b68d7..a61a3e3082ae 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,6 +61,18 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +static inline void tcp_packets_out_inc(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orig = tp->packets_out; + + tp->packets_out += tcp_skb_pcount(skb); + if (!orig) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); +} + static void update_send_head(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 6728e7dc3e577241f36921c720cfb4eb8f5aed1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Aug 2007 22:53:26 -0700 Subject: [TCP]: Rename tcp_ack_packets_out -> tcp_rearm_rto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only thing that tiny function does is rearming the RTO (if necessary), name it accordingly. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 593960d66ed9..0ead46f2bcd5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2424,8 +2424,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk) +static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2581,7 +2580,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) = inet_csk(sk)->icsk_ca_ops; tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk); + tcp_rearm_rto(sk); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); -- cgit v1.2.3 From 5b3c98821a8753239aefc1c217409aa3e5c90787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Aug 2007 22:54:44 -0700 Subject: [TCP]: Discard fuzzy SACK blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SACK processing code has been a sort of russian roulette as no validation of SACK blocks is previously attempted. Besides, it is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). So now close the roulette once and for all. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0ead46f2bcd5..a2364ebf8585 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1019,7 +1019,86 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * ---------------------- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * <- outs wnd -> <- wrapzone -> + * u e n u_w e_w s n_w + * | | | | | | | + * |<------------+------+----- TCP seqno space --------------+---------->| + * ...-- <2^31 ->| |<--------... + * ...---- >2^31 ------>| |<--------... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap -> snd_una region will then become "well defined", i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp->max_window can + * be used as an exaggerated estimate. */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp->snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. + * start_seq == snd_una is non-sensical (see comments above) + */ + if (after(start_seq, tp->snd_una)) + return 1; + + if (!is_dsack || !tp->undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp->snd_una)) + return 0; + + if (!before(start_seq, tp->undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp->undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: + * start_seq < undo_marker and end_seq >= undo_marker. + */ + return !before(start_seq, end_seq - tp->max_window); +} + + static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1161,6 +1240,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack && (i == first_sack_index)); + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) + continue; + skb = cached_skb; fack_count = cached_fack_count; -- cgit v1.2.3 From 18f02545a9a16c9a89778b91a162ad16d510bb32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Aug 2007 22:55:52 -0700 Subject: [TCP] MIB: Add counters for discarded SACK blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In DSACK case, some events are not extraordinary, such as packet duplication generated DSACK. They can arrive easily below snd_una when undo_marker is not set (TCP being in CA_Open), counting such DSACKs amoung SACK discards will likely just mislead if they occur in some scenario when there are other problems as well. Similarly, excessively delayed packets could cause "normal" DSACKs. Therefore, separate counters are allocated for DSACK events. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/proc.c | 3 +++ net/ipv4/tcp_input.c | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf2a4ee..986d1c83a000 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -244,6 +244,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), + SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), + SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a2364ebf8585..f9e4d7ad68b7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1240,8 +1240,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack && (i == first_sack_index)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp->undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); continue; + } skb = cached_skb; fack_count = cached_fack_count; -- cgit v1.2.3 From 356f89e12e301376f26795643f3b5931c81c9cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 24 Aug 2007 23:00:31 -0700 Subject: [NET] Cleanup: DIV_ROUND_UP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a61a3e3082ae..d65d17bb2a09 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -646,11 +646,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } else { - unsigned int factor; - - factor = skb->len + (mss_now - 1); - factor /= mss_now; - skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now); skb_shinfo(skb)->gso_size = mss_now; skb_shinfo(skb)->gso_type = sk->sk_gso_type; } -- cgit v1.2.3 From 32c1da70810017a98aa6c431a5494a302b6b9a30 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 24 Aug 2007 23:09:41 -0700 Subject: [UDP]: Randomize port selection. This patch causes UDP port allocation to be randomized like TCP. The earlier code would always choose same port (ie first empty list). Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/udp.c | 84 ++++++++++++++++++++++++++++------------------------- net/ipv4/udp_impl.h | 2 +- net/ipv4/udplite.c | 3 +- 3 files changed, 47 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69d4bd10f9c6..a581b543bff7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -113,9 +113,8 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); -static int udp_port_rover; - -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +static inline int __udp_lib_lport_inuse(__u16 num, + const struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; @@ -132,11 +131,10 @@ static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) * @sk: socket struct in question * @snum: port number to look up * @udptable: hash list table, must be of UDP_HTABLE_SIZE - * @port_rover: pointer to record of last unallocated port * @saddr_comp: AF-dependent comparison of bound local IP addresses */ int __udp_lib_get_port(struct sock *sk, unsigned short snum, - struct hlist_head udptable[], int *port_rover, + struct hlist_head udptable[], int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2 ) ) { @@ -146,49 +144,56 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, int error = 1; write_lock_bh(&udp_hash_lock); - if (snum == 0) { - int best_size_so_far, best, result, i; - - if (*port_rover > sysctl_local_port_range[1] || - *port_rover < sysctl_local_port_range[0]) - *port_rover = sysctl_local_port_range[0]; - best_size_so_far = 32767; - best = result = *port_rover; - for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { - int size; - - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; - if (hlist_empty(head)) { - if (result > sysctl_local_port_range[1]) - result = sysctl_local_port_range[0] + - ((result - sysctl_local_port_range[0]) & - (UDP_HTABLE_SIZE - 1)); + + if (!snum) { + int i; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + unsigned rover, best, best_size_so_far; + + best_size_so_far = UINT_MAX; + best = rover = net_random() % (high - low) + low; + + /* 1st pass: look for empty (or shortest) hash chain */ + for (i = 0; i < UDP_HTABLE_SIZE; i++) { + int size = 0; + + head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(head)) goto gotit; - } - size = 0; + sk_for_each(sk2, node, head) { if (++size >= best_size_so_far) goto next; } best_size_so_far = size; - best = result; + best = rover; next: - ; + /* fold back if end of range */ + if (++rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); + + } - result = best; - for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; - i++, result += UDP_HTABLE_SIZE) { - if (result > sysctl_local_port_range[1]) - result = sysctl_local_port_range[0] - + ((result - sysctl_local_port_range[0]) & - (UDP_HTABLE_SIZE - 1)); - if (! __udp_lib_lport_inuse(result, udptable)) - break; + + /* 2nd pass: find hole in shortest hash chain */ + rover = best; + for (i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++) { + if (! __udp_lib_lport_inuse(rover, udptable)) + goto gotit; + rover += UDP_HTABLE_SIZE; + if (rover > high) + rover = low + ((rover - low) + & (UDP_HTABLE_SIZE - 1)); } - if (i >= (1 << 16) / UDP_HTABLE_SIZE) - goto fail; + + + /* All ports in use! */ + goto fail; + gotit: - *port_rover = snum = result; + snum = rover; } else { head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; @@ -201,6 +206,7 @@ gotit: (*saddr_comp)(sk, sk2) ) goto fail; } + inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { @@ -217,7 +223,7 @@ fail: int udp_get_port(struct sock *sk, unsigned short snum, int (*scmp)(const struct sock *, const struct sock *)) { - return __udp_lib_get_port(sk, snum, udp_hash, &udp_port_rover, scmp); + return __udp_lib_get_port(sk, snum, udp_hash, scmp); } int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 820a477cfaa6..6c55828e41ba 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -9,7 +9,7 @@ extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); extern int __udp_lib_get_port(struct sock *sk, unsigned short snum, - struct hlist_head udptable[], int *port_rover, + struct hlist_head udptable[], int (*)(const struct sock*,const struct sock*)); extern int ipv4_rcv_saddr_equal(const struct sock *, const struct sock *); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index f34fd686a8f1..94977205abb4 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -16,12 +16,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics) __read_mostly; struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; -static int udplite_port_rover; int udplite_get_port(struct sock *sk, unsigned short p, int (*c)(const struct sock *, const struct sock *)) { - return __udp_lib_get_port(sk, p, udplite_hash, &udplite_port_rover, c); + return __udp_lib_get_port(sk, p, udplite_hash, c); } static int udplite_v4_get_port(struct sock *sk, unsigned short snum) -- cgit v1.2.3 From 3b26a9a655ee73a87071a9f6a1fdd5311e31d7c9 Mon Sep 17 00:00:00 2001 From: Masahide NAKAMURA Date: Fri, 24 Aug 2007 23:33:01 -0700 Subject: [IPV4] IPSEC: Omit redirect for tunnelled packet. IPv4 IPsec tunnel gateway incorrectly sends redirect to sender if it is onlink host when network device the IPsec tunnelled packet is arrived is the same as the one the decapsulated packet is sent. With this patch, it omits to send the redirect when the forwarding skbuff carries secpath, since such skbuff should be assumed as a decapsulated packet from IPsec tunnel by own. Request for comments: Alternatively we'd have another way to change net/ipv4/route.c (__mkroute_input) to use RTCF_DOREDIRECT flag unless skbuff has no secpath. It is better than this patch at performance point of view because IPv4 redirect judgement is done at routing slow-path. However, it should be taken care of resource changes between SAD(XFRM states) and routing table. In other words, When IPv4 SAD is changed does the related routing entry go to its slow-path? If not, it is reasonable to apply this patch. Signed-off-by: Masahide NAKAMURA Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 8c95cf09f87a..afbf938836f5 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -105,7 +105,7 @@ int ip_forward(struct sk_buff *skb) * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ - if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb->sp) ip_rt_send_redirect(skb); skb->priority = rt_tos2priority(iph->tos); -- cgit v1.2.3 From 172589ccdde41b59861c92c4a971b95514ef24e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 28 Aug 2007 15:50:33 -0700 Subject: [NET]: DIV_ROUND_UP cleanup (part two) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hopefully captured all single statement cases under net/. I'm not too sure if there is some policy about #includes that are "guaranteed" (ie., in the current tree) to be available through some other #included header, so I just added linux/kernel.h to each changed file that didn't #include it previously. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 5 +++-- net/ipv4/inet_timewait_sock.c | 4 ++-- net/ipv4/tcp.c | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index def007ec1d6f..686ddd62f71a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -11,6 +11,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -112,7 +113,7 @@ static int inet_csk_diag_fill(struct sock *sk, } #endif -#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ +#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) if (icsk->icsk_pending == ICSK_TIME_RETRANS) { r->idiag_timer = 1; @@ -190,7 +191,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->id.idiag_dst[0] = tw->tw_daddr; r->idiag_state = tw->tw_substate; r->idiag_timer = 3; - r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2586df09b9b6..4e189e28f306 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -8,7 +8,7 @@ * From code orinally in TCP */ - +#include #include #include #include @@ -292,7 +292,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, if (timeo >= timewait_len) { slot = INET_TWDR_TWKILL_SLOTS - 1; } else { - slot = (timeo + twdr->period - 1) / twdr->period; + slot = DIV_ROUND_UP(timeo, twdr->period); if (slot >= INET_TWDR_TWKILL_SLOTS) slot = INET_TWDR_TWKILL_SLOTS - 1; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index aff31427f525..18c64c74f6d5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -247,6 +247,7 @@ * TCP_CLOSE socket is finished */ +#include #include #include #include @@ -2210,7 +2211,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) goto out; mss = skb_shinfo(skb)->gso_size; - skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); segs = NULL; goto out; -- cgit v1.2.3 From 457c4cbc5a3dde259d2a1f15d5f9785290397267 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 12:01:34 +0200 Subject: [NET]: Make /proc/net per network namespace This patch makes /proc/net per network namespace. It modifies the global variables proc_net and proc_net_stat to be per network namespace. The proc_net file helpers are modified to take a network namespace argument, and all of their callers are fixed to pass &init_net for that argument. This ensures that all of the /proc/net files are only visible and usable in the initial network namespace until the code behind them has been updated to be handle multiple network namespaces. Making /proc/net per namespace is necessary as at least some files in /proc/net depend upon the set of network devices which is per network namespace, and even more files in /proc/net have contents that are relevant to a single network namespace. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 ++- net/ipv4/fib_hash.c | 5 +++-- net/ipv4/fib_trie.c | 17 +++++++++-------- net/ipv4/igmp.c | 5 +++-- net/ipv4/ipconfig.c | 3 ++- net/ipv4/ipmr.c | 5 +++-- net/ipv4/ipvs/ip_vs_app.c | 5 +++-- net/ipv4/ipvs/ip_vs_conn.c | 5 +++-- net/ipv4/ipvs/ip_vs_ctl.c | 9 +++++---- net/ipv4/ipvs/ip_vs_lblcr.c | 5 +++-- net/ipv4/netfilter/ip_queue.c | 8 ++++---- net/ipv4/netfilter/ipt_CLUSTERIP.c | 3 ++- net/ipv4/netfilter/ipt_recent.c | 5 +++-- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 17 +++++++++-------- net/ipv4/proc.c | 11 ++++++----- net/ipv4/raw.c | 5 +++-- net/ipv4/route.c | 7 ++++--- net/ipv4/tcp_ipv4.c | 5 +++-- net/ipv4/tcp_probe.c | 7 ++++--- net/ipv4/udp.c | 5 +++-- 20 files changed, 77 insertions(+), 58 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9ab9d534fbac..78dd3443016c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -103,6 +103,7 @@ #include #endif +#include #include #include #include @@ -1400,7 +1401,7 @@ static const struct file_operations arp_seq_fops = { static int __init arp_proc_init(void) { - if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) + if (!proc_net_fops_create(&init_net, "arp", S_IRUGO, &arp_seq_fops)) return -ENOMEM; return 0; } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 9ad1d9ff9ce7..9fafbeea8fe6 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -1068,13 +1069,13 @@ static const struct file_operations fib_seq_fops = { int __init fib_proc_init(void) { - if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) + if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_seq_fops)) return -ENOMEM; return 0; } void __init fib_proc_exit(void) { - proc_net_remove("route"); + proc_net_remove(&init_net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 52b2891c63b7..be34bd556d58 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include @@ -2530,30 +2531,30 @@ static const struct file_operations fib_route_fops = { int __init fib_proc_init(void) { - if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops)) + if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops)) goto out1; - if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops)) + if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops)) goto out2; - if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops)) + if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops)) goto out3; return 0; out3: - proc_net_remove("fib_triestat"); + proc_net_remove(&init_net, "fib_triestat"); out2: - proc_net_remove("fib_trie"); + proc_net_remove(&init_net, "fib_trie"); out1: return -ENOMEM; } void __init fib_proc_exit(void) { - proc_net_remove("fib_trie"); - proc_net_remove("fib_triestat"); - proc_net_remove("route"); + proc_net_remove(&init_net, "fib_trie"); + proc_net_remove(&init_net, "fib_triestat"); + proc_net_remove(&init_net, "route"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a646409c2d06..d78599a9dbd5 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,7 @@ #include #include +#include #include #include #include @@ -2613,8 +2614,8 @@ static const struct file_operations igmp_mcf_seq_fops = { int __init igmp_mc_proc_init(void) { - proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops); - proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops); + proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); return 0; } #endif diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index c5b247077539..5ae4849878a3 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -1253,7 +1254,7 @@ static int __init ip_auto_config(void) __be32 addr; #ifdef CONFIG_PROC_FS - proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops); + proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7003cc1b7fe2..35683e1a42e8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -1922,7 +1923,7 @@ void __init ip_mr_init(void) ipmr_expire_timer.function=ipmr_expire_process; register_netdevice_notifier(&ip_mr_notifier); #ifdef CONFIG_PROC_FS - proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops); - proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops); + proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); + proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops); #endif } diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 8d6901d4e94f..341474eefa55 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -616,12 +617,12 @@ int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, int ip_vs_app_init(void) { /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops); + proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } void ip_vs_app_cleanup(void) { - proc_net_remove("ip_vs_app"); + proc_net_remove(&init_net, "ip_vs_app"); } diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d612a6a5d957..4b702f708d30 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -35,6 +35,7 @@ #include #include +#include #include @@ -922,7 +923,7 @@ int ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); @@ -938,6 +939,6 @@ void ip_vs_conn_cleanup(void) /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove("ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn"); vfree(ip_vs_conn_tab); } diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index f656d41d8d41..61d023d58b5d 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -2356,8 +2357,8 @@ int ip_vs_control_init(void) return ret; } - proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops); + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); sysctl_header = register_sysctl_table(vs_root_table); @@ -2390,8 +2391,8 @@ void ip_vs_control_cleanup(void) cancel_work_sync(&defense_work.work); ip_vs_kill_estimator(&ip_vs_stats); unregister_sysctl_table(sysctl_header); - proc_net_remove("ip_vs_stats"); - proc_net_remove("ip_vs"); + proc_net_remove(&init_net, "ip_vs_stats"); + proc_net_remove(&init_net, "ip_vs"); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); } diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 6225acac7a3b..6a1fec416eaf 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -50,6 +50,7 @@ #include /* for proc_net_create/proc_net_remove */ #include +#include #include @@ -843,7 +844,7 @@ static int __init ip_vs_lblcr_init(void) INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); sysctl_header = register_sysctl_table(lblcr_root_table); #ifdef CONFIG_IP_VS_LBLCR_DEBUG - proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); + proc_net_create(&init_net, "ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); #endif return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); } @@ -852,7 +853,7 @@ static int __init ip_vs_lblcr_init(void) static void __exit ip_vs_lblcr_cleanup(void) { #ifdef CONFIG_IP_VS_LBLCR_DEBUG - proc_net_remove("ip_vs_lblcr"); + proc_net_remove(&init_net, "ip_vs_lblcr"); #endif unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 702d94db19b9..cb5e61a1d7ab 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -674,7 +675,7 @@ static int __init ip_queue_init(void) goto cleanup_netlink_notifier; } - proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + proc = proc_net_create(&init_net, IPQ_PROC_FS_NAME, 0, ipq_get_info); if (proc) proc->owner = THIS_MODULE; else { @@ -695,8 +696,7 @@ static int __init ip_queue_init(void) cleanup_sysctl: unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(IPQ_PROC_FS_NAME); - + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); cleanup_ipqnl: sock_release(ipqnl->sk_socket); mutex_lock(&ipqnl_mutex); @@ -715,7 +715,7 @@ static void __exit ip_queue_fini(void) unregister_sysctl_table(ipq_sysctl_header); unregister_netdevice_notifier(&ipq_dev_notifier); - proc_net_remove(IPQ_PROC_FS_NAME); + proc_net_remove(&init_net, IPQ_PROC_FS_NAME); sock_release(ipqnl->sk_socket); mutex_lock(&ipqnl_mutex); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 69bd362b5fa2..50fc9e009fe4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define CLUSTERIP_VERSION "0.8" @@ -726,7 +727,7 @@ static int __init ipt_clusterip_init(void) goto cleanup_target; #ifdef CONFIG_PROC_FS - clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net); + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net); if (!clusterip_procdir) { printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); ret = -ENOMEM; diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 6d0c0f7364ad..db2a79889f9a 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -487,7 +488,7 @@ static int __init ipt_recent_init(void) #ifdef CONFIG_PROC_FS if (err) return err; - proc_dir = proc_mkdir("ipt_recent", proc_net); + proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); if (proc_dir == NULL) { xt_unregister_match(&recent_match); err = -ENOMEM; @@ -501,7 +502,7 @@ static void __exit ipt_recent_exit(void) BUG_ON(!list_empty(&tables)); xt_unregister_match(&recent_match); #ifdef CONFIG_PROC_FS - remove_proc_entry("ipt_recent", proc_net); + remove_proc_entry("ipt_recent", init_net.proc_net); #endif } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index b3dd5de9a258..a5ae2eabf0f3 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -408,16 +409,16 @@ int __init nf_conntrack_ipv4_compat_init(void) { struct proc_dir_entry *proc, *proc_exp, *proc_stat; - proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); + proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); if (!proc) goto err1; - proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, &ip_exp_file_ops); if (!proc_exp) goto err2; - proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, init_net.proc_net_stat); if (!proc_stat) goto err3; @@ -427,16 +428,16 @@ int __init nf_conntrack_ipv4_compat_init(void) return 0; err3: - proc_net_remove("ip_conntrack_expect"); + proc_net_remove(&init_net, "ip_conntrack_expect"); err2: - proc_net_remove("ip_conntrack"); + proc_net_remove(&init_net, "ip_conntrack"); err1: return -ENOMEM; } void __exit nf_conntrack_ipv4_compat_fini(void) { - remove_proc_entry("ip_conntrack", proc_net_stat); - proc_net_remove("ip_conntrack_expect"); - proc_net_remove("ip_conntrack"); + remove_proc_entry("ip_conntrack", init_net.proc_net_stat); + proc_net_remove(&init_net, "ip_conntrack_expect"); + proc_net_remove(&init_net, "ip_conntrack"); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 986d1c83a000..95a8f8f2de71 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -34,6 +34,7 @@ * 2 of the License, or (at your option) any later version. */ #include +#include #include #include #include @@ -383,20 +384,20 @@ int __init ip_misc_proc_init(void) { int rc = 0; - if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) + if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) goto out_netstat; - if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) + if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) goto out_snmp; - if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) + if (!proc_net_fops_create(&init_net, "sockstat", S_IRUGO, &sockstat_seq_fops)) goto out_sockstat; out: return rc; out_sockstat: - proc_net_remove("snmp"); + proc_net_remove(&init_net, "snmp"); out_snmp: - proc_net_remove("netstat"); + proc_net_remove(&init_net, "netstat"); out_netstat: rc = -ENOMEM; goto out; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index c6d71526f625..216e01b0f44a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -928,13 +929,13 @@ static const struct file_operations raw_seq_fops = { int __init raw_proc_init(void) { - if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) + if (!proc_net_fops_create(&init_net, "raw", S_IRUGO, &raw_seq_fops)) return -ENOMEM; return 0; } void __init raw_proc_exit(void) { - proc_net_remove("raw"); + proc_net_remove(&init_net, "raw"); } #endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c7ca94bd152c..efd2a9202d68 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -3011,15 +3012,15 @@ int __init ip_rt_init(void) #ifdef CONFIG_PROC_FS { struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ - if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || + if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) || !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, - proc_net_stat))) { + init_net.proc_net_stat))) { return -ENOMEM; } rtstat_pde->proc_fops = &rt_cpu_seq_fops; } #ifdef CONFIG_NET_CLS_ROUTE - create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); + create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL); #endif #endif #ifdef CONFIG_XFRM diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e089a978e128..8855e640e958 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -62,6 +62,7 @@ #include #include +#include #include #include #include @@ -2249,7 +2250,7 @@ int tcp_proc_register(struct tcp_seq_afinfo *afinfo) afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -2261,7 +2262,7 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo) { if (!afinfo) return; - proc_net_remove(afinfo->name); + proc_net_remove(&init_net, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index b76398d1b454..87dd5bff315f 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -228,7 +229,7 @@ static __init int tcpprobe_init(void) if (!tcp_probe.log) goto err0; - if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) + if (!proc_net_fops_create(&init_net, procname, S_IRUSR, &tcpprobe_fops)) goto err0; ret = register_jprobe(&tcp_jprobe); @@ -238,7 +239,7 @@ static __init int tcpprobe_init(void) pr_info("TCP probe registered (port=%d)\n", port); return 0; err1: - proc_net_remove(procname); + proc_net_remove(&init_net, procname); err0: kfree(tcp_probe.log); return ret; @@ -247,7 +248,7 @@ module_init(tcpprobe_init); static __exit void tcpprobe_exit(void) { - proc_net_remove(procname); + proc_net_remove(&init_net, procname); unregister_jprobe(&tcp_jprobe); kfree(tcp_probe.log); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a581b543bff7..ef4d901ee9ad 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -98,6 +98,7 @@ #include #include #include +#include #include #include #include @@ -1566,7 +1567,7 @@ int udp_proc_register(struct udp_seq_afinfo *afinfo) afinfo->seq_fops->llseek = seq_lseek; afinfo->seq_fops->release = seq_release_private; - p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + p = proc_net_fops_create(&init_net, afinfo->name, S_IRUGO, afinfo->seq_fops); if (p) p->data = afinfo; else @@ -1578,7 +1579,7 @@ void udp_proc_unregister(struct udp_seq_afinfo *afinfo) { if (!afinfo) return; - proc_net_remove(afinfo->name); + proc_net_remove(&init_net, afinfo->name); memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); } -- cgit v1.2.3 From 1b8d7ae42d02e483ad94035cca851e4f7fbecb40 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 8 Oct 2007 23:24:22 -0700 Subject: [NET]: Make socket creation namespace safe. This patch passes in the namespace a new socket should be created in and has the socket code do the appropriate reference counting. By virtue of this all socket create methods are touched. In addition the socket create methods are modified so that they will fail if you attempt to create a socket in a non-default network namespace. Failing if we attempt to create a socket outside of the default network namespace ensures that as we incrementally make the network stack network namespace aware we will not export functionality that someone has not audited and made certain is network namespace safe. Allowing us to partially enable network namespaces before all of the exotic protocols are supported. Any protocol layers I have missed will fail to compile because I now pass an extra parameter into the socket creation code. [ Integrated AF_IUCV build fixes from Andrew Morton... -DaveM ] Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e68103475cca..110a19edacc8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -241,7 +241,7 @@ EXPORT_SYMBOL(build_ehash_secret); * Create an inet socket. */ -static int inet_create(struct socket *sock, int protocol) +static int inet_create(struct net *net, struct socket *sock, int protocol) { struct sock *sk; struct list_head *p; @@ -253,6 +253,9 @@ static int inet_create(struct socket *sock, int protocol) int try_loading_module = 0; int err; + if (net != &init_net) + return -EAFNOSUPPORT; + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM && !inet_ehash_secret) @@ -320,7 +323,7 @@ lookup_protocol: BUG_TRAP(answer_prot->slab != NULL); err = -ENOBUFS; - sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, 1); if (sk == NULL) goto out; -- cgit v1.2.3 From e730c15519d09ea528b4d2f1103681fa5937c0e6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Sep 2007 11:53:39 -0700 Subject: [NET]: Make packet reception network namespace safe This patch modifies every packet receive function registered with dev_add_pack() to drop packets if they are not from the initial network namespace. This should ensure that the various network stacks do not receive packets in a anything but the initial network namespace until the code has been converted and is ready for them. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 +++ net/ipv4/ip_input.c | 3 +++ net/ipv4/ipconfig.c | 6 ++++++ 3 files changed, 12 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 78dd3443016c..bde129708e22 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -932,6 +932,9 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, { struct arphdr *arp; + if (dev->nd_net != &init_net) + goto freeskb; + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ if (!pskb_may_pull(skb, (sizeof(struct arphdr) + (2 * dev->addr_len) + diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 97069399d864..41d8964591e7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -382,6 +382,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct iphdr *iph; u32 len; + if (dev->nd_net != &init_net) + goto drop; + /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 5ae4849878a3..08ff623371f0 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -426,6 +426,9 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt unsigned char *sha, *tha; /* s for "source", t for "target" */ struct ic_device *d; + if (dev->nd_net != &init_net) + goto drop; + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) return NET_RX_DROP; @@ -835,6 +838,9 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str struct ic_device *d; int len, ext_len; + if (dev->nd_net != &init_net) + goto drop; + /* Perform verifications before taking the lock. */ if (skb->pkt_type == PACKET_OTHERHOST) goto drop; -- cgit v1.2.3 From e9dc86534051b78e41e5b746cccc291b57a3a311 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 13:02:17 +0200 Subject: [NET]: Make device event notification network namespace safe Every user of the network device notifiers is either a protocol stack or a pseudo device. If a protocol stack that does not have support for multiple network namespaces receives an event for a device that is not in the initial network namespace it quite possibly can get confused and do the wrong thing. To avoid problems until all of the protocol stacks are converted this patch modifies all netdev event handlers to ignore events on devices that are not in the initial network namespace. As the rest of the code is made network namespace aware these checks can be removed. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 +++ net/ipv4/devinet.c | 3 +++ net/ipv4/fib_frontend.c | 3 +++ net/ipv4/ipmr.c | 7 ++++++- net/ipv4/netfilter/ip_queue.c | 3 +++ net/ipv4/netfilter/ipt_MASQUERADE.c | 3 +++ 6 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index bde129708e22..a11e7a5c1da4 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1205,6 +1205,9 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5dbe5803b7d5..c5eb1a29a5cf 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1051,6 +1051,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + ASSERT_RTNL(); if (!in_dev) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index eff6bce453ee..cefb55ec3d62 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -860,6 +860,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2); return NOTIFY_DONE; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 35683e1a42e8..036598835c63 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1083,13 +1083,18 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct net_device *dev = ptr; struct vif_device *v; int ct; + + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v=&vif_table[0]; for (ct=0;ctdev==ptr) + if (v->dev==dev) vif_delete(ct); } return NOTIFY_DONE; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index cb5e61a1d7ab..d91856097f25 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -557,6 +557,9 @@ ipq_rcv_dev_event(struct notifier_block *this, { struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) ipq_dev_drop(dev->ifindex); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 7c4e4be7c8b3..3e0b562b2db7 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -125,6 +125,9 @@ static int masq_device_event(struct notifier_block *this, { const struct net_device *dev = ptr; + if (dev->nd_net != &init_net) + return NOTIFY_DONE; + if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for conntracks which were associated with that device, -- cgit v1.2.3 From b4b510290b056b86611757ce1175a230f1080f53 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 12 Sep 2007 13:05:38 +0200 Subject: [NET]: Support multiple network namespaces with netlink Each netlink socket will live in exactly one network namespace, this includes the controlling kernel sockets. This patch updates all of the existing netlink protocols to only support the initial network namespace. Request by clients in other namespaces will get -ECONREFUSED. As they would if the kernel did not have the support for that netlink protocol compiled in. As each netlink protocol is updated to be multiple network namespace safe it can register multiple kernel sockets to acquire a presence in the rest of the network namespaces. The implementation in af_netlink is a simple filter implementation at hash table insertion and hash table look up time. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 4 ++-- net/ipv4/inet_diag.c | 4 ++-- net/ipv4/netfilter/ip_queue.c | 6 +++--- net/ipv4/netfilter/ipt_ULOG.c | 3 ++- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cefb55ec3d62..140bf7a8d877 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -816,8 +816,8 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, NULL, - THIS_MODULE); + netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, nl_fib_input, + NULL, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 686ddd62f71a..031cc4856b49 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -897,8 +897,8 @@ static int __init inet_diag_init(void) if (!inet_diag_table) goto out; - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, - NULL, THIS_MODULE); + idiagnl = netlink_kernel_create(&init_net, NETLINK_INET_DIAG, 0, + inet_diag_rcv, NULL, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; err = 0; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index d91856097f25..82fda92e6b97 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -579,7 +579,7 @@ ipq_rcv_nl_event(struct notifier_block *this, if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL && n->pid) { write_lock_bh(&queue_lock); - if (n->pid == peer_pid) + if ((n->net == &init_net) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } @@ -671,8 +671,8 @@ static int __init ip_queue_init(void) struct proc_dir_entry *proc; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, - NULL, THIS_MODULE); + ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, + ipq_rcv_sk, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 6ca43e4ca7e3..c636d6d63574 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -409,7 +409,8 @@ static int __init ipt_ulog_init(void) for (i = 0; i < ULOG_MAXNLGROUPS; i++) setup_timer(&ulog_buffers[i].timer, ulog_timer, i); - nflognl = netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + nflognl = netlink_kernel_create(&init_net, + NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; -- cgit v1.2.3 From 881d966b48b035ab3f3aeaae0f3d3f9b584f45b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Sep 2007 11:56:21 -0700 Subject: [NET]: Make the device list and device lookups per namespace. This patch makes most of the generic device layer network namespace safe. This patch makes dev_base_head a network namespace variable, and then it picks up a few associated variables. The functions: dev_getbyhwaddr dev_getfirsthwbytype dev_get_by_flags dev_get_by_name __dev_get_by_name dev_get_by_index __dev_get_by_index dev_ioctl dev_ethtool dev_load wireless_process_ioctl were modified to take a network namespace argument, and deal with it. vlan_ioctl_set and brioctl_set were modified so their hooks will receive a network namespace argument. So basically anthing in the core of the network stack that was affected to by the change of dev_base was modified to handle multiple network namespaces. The rest of the network stack was simply modified to explicitly use &init_net the initial network namespace. This can be fixed when those components of the network stack are modified to handle multiple network namespaces. For now the ifindex generator is left global. Fundametally ifindex numbers are per namespace, or else we will have corner case problems with migration when we get that far. At the same time there are assumptions in the network stack that the ifindex of a network device won't change. Making the ifindex number global seems a good compromise until the network stack can cope with ifindex changes when you change namespaces, and the like. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/arp.c | 4 ++-- net/ipv4/devinet.c | 18 +++++++++--------- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/icmp.c | 2 +- net/ipv4/igmp.c | 4 ++-- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipconfig.c | 2 +- net/ipv4/ipip.c | 4 ++-- net/ipv4/ipmr.c | 4 ++-- net/ipv4/ipvs/ip_vs_sync.c | 10 +++++----- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv4/route.c | 4 ++-- 15 files changed, 34 insertions(+), 34 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a11e7a5c1da4..3a683006d761 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -981,7 +981,7 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev) if (mask && mask != htonl(0xFFFFFFFF)) return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); + dev = dev_getbyhwaddr(&init_net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1169,7 +1169,7 @@ int arp_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(r.arp_dev)) == NULL) + if ((dev = __dev_get_by_name(&init_net, r.arp_dev)) == NULL) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c5eb1a29a5cf..721b89b60963 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -420,7 +420,7 @@ struct in_device *inetdev_by_index(int ifindex) struct net_device *dev; struct in_device *in_dev = NULL; read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifindex); + dev = __dev_get_by_index(&init_net, ifindex); if (dev) in_dev = in_dev_get(dev); read_unlock(&dev_base_lock); @@ -506,7 +506,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh) goto errout; } - dev = __dev_get_by_index(ifm->ifa_index); + dev = __dev_get_by_index(&init_net, ifm->ifa_index); if (dev == NULL) { err = -ENODEV; goto errout; @@ -628,7 +628,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) *colon = 0; #ifdef CONFIG_KMOD - dev_load(ifr.ifr_name); + dev_load(&init_net, ifr.ifr_name); #endif switch (cmd) { @@ -669,7 +669,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifr.ifr_name)) == NULL) goto done; if (colon) @@ -909,7 +909,7 @@ no_in_dev: */ read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if ((in_dev = __in_dev_get_rcu(dev)) == NULL) continue; @@ -988,7 +988,7 @@ __be32 inet_confirm_addr(const struct net_device *dev, __be32 dst, __be32 local, read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if ((in_dev = __in_dev_get_rcu(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -1185,7 +1185,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_ip_idx = ip_idx = cb->args[1]; idx = 0; - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if (idx < s_idx) goto cont; if (idx > s_idx) @@ -1244,7 +1244,7 @@ static void devinet_copy_dflt_conf(int i) struct net_device *dev; read_lock(&dev_base_lock); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1333,7 +1333,7 @@ void inet_forward_change(void) IPV4_DEVCONF_DFLT(FORWARDING) = on; read_lock(&dev_base_lock); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 140bf7a8d877..df17aab193b4 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -334,7 +334,7 @@ static int rtentry_to_fib_config(int cmd, struct rtentry *rt, colon = strchr(devname, ':'); if (colon) *colon = 0; - dev = __dev_get_by_name(devname); + dev = __dev_get_by_name(&init_net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c434119deb52..d30fb68d5f4e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -533,7 +533,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, return -EINVAL; if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL) + if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL) return -ENODEV; if (!(dev->flags&IFF_UP)) return -ENETDOWN; @@ -799,7 +799,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (nhs != 1 || nh->nh_gw) goto err_inval; nh->nh_scope = RT_SCOPE_NOWHERE; - nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif); err = -ENODEV; if (nh->nh_dev == NULL) goto failure; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 02a899bec196..68a22670f597 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -517,7 +517,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct net_device *dev = NULL; if (rt->fl.iif && sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(rt->fl.iif); + dev = dev_get_by_index(&init_net, rt->fl.iif); if (dev) { saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d78599a9dbd5..ad500a43b359 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2292,7 +2292,7 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; - for_each_netdev(state->dev) { + for_each_netdev(&init_net, state->dev) { struct in_device *in_dev; in_dev = in_dev_get(state->dev); if (!in_dev) @@ -2454,7 +2454,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) state->idev = NULL; state->im = NULL; - for_each_netdev(state->dev) { + for_each_netdev(&init_net, state->dev) { struct in_device *idev; idev = in_dev_get(state->dev); if (unlikely(idev == NULL)) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 0231bdcb2ab7..fabb86db763b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -292,7 +292,7 @@ static void ip_expire(unsigned long arg) if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) { struct sk_buff *head = qp->fragments; /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(qp->iif)) != NULL) { + if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); dev_put(head->dev); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5c14ed63e56c..3106225c5e50 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -262,7 +262,7 @@ static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int int i; for (i=1; i<100; i++) { sprintf(name, "gre%d", i); - if (__dev_get_by_name(name) == NULL) + if (__dev_get_by_name(&init_net, name) == NULL) break; } if (i==100) @@ -1196,7 +1196,7 @@ static int ipgre_tunnel_init(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(tunnel->parms.link); + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6b420aedcdcf..b2b3053dfef7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -602,7 +602,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, dev_put(dev); } } else - dev = __dev_get_by_index(mreq.imr_ifindex); + dev = __dev_get_by_index(&init_net, mreq.imr_ifindex); err = -EADDRNOTAVAIL; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 08ff623371f0..4303851749f6 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -193,7 +193,7 @@ static int __init ic_open_devs(void) if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0) printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name); - for_each_netdev(dev) { + for_each_netdev(&init_net, dev) { if (dev == &loopback_dev) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 396437242a1b..652bd86e33a3 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -225,7 +225,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c int i; for (i=1; i<100; i++) { sprintf(name, "tunl%d", i); - if (__dev_get_by_name(name) == NULL) + if (__dev_get_by_name(&init_net, name) == NULL) break; } if (i==100) @@ -822,7 +822,7 @@ static int ipip_tunnel_init(struct net_device *dev) } if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(tunnel->parms.link); + tdev = __dev_get_by_index(&init_net, tunnel->parms.link); if (tdev) { dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 036598835c63..b8b4b497fb57 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -125,7 +125,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) { struct net_device *dev; - dev = __dev_get_by_name("tunl0"); + dev = __dev_get_by_name(&init_net, "tunl0"); if (dev) { int err; @@ -149,7 +149,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) dev = NULL; - if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) { + if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { dev->flags |= IFF_MULTICAST; in_dev = __in_dev_get_rtnl(dev); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 356f067484e3..1960747f354c 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -387,7 +387,7 @@ static int set_mcast_if(struct sock *sk, char *ifname) struct net_device *dev; struct inet_sock *inet = inet_sk(sk); - if ((dev = __dev_get_by_name(ifname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -412,7 +412,7 @@ static int set_sync_mesg_maxlen(int sync_state) int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - @@ -423,7 +423,7 @@ static int set_sync_mesg_maxlen(int sync_state) IP_VS_DBG(7, "setting the maximum length of sync sending " "message %d.\n", sync_send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) return -ENODEV; sync_recv_mesg_maxlen = dev->mtu - @@ -451,7 +451,7 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(ifname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -472,7 +472,7 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(ifname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 50fc9e009fe4..27f14e1ebd8b 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -401,7 +401,7 @@ checkentry(const char *tablename, return false; } - dev = dev_get_by_name(e->ip.iniface); + dev = dev_get_by_name(&init_net, e->ip.iniface); if (!dev) { printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); return false; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index efd2a9202d68..396c631166a4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2213,7 +2213,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) if (oldflp->oif) { - dev_out = dev_get_by_index(oldflp->oif); + dev_out = dev_get_by_index(&init_net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; @@ -2592,7 +2592,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (iif) { struct net_device *dev; - dev = __dev_get_by_index(iif); + dev = __dev_get_by_index(&init_net, iif); if (dev == NULL) { err = -ENODEV; goto errout_free; -- cgit v1.2.3 From 8f4c1f9b049df3be11090f1c2c4738700302acae Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 12 Sep 2007 14:44:36 +0200 Subject: [NETLINK]: Introduce nested and byteorder flag to netlink attribute This change allows the generic attribute interface to be used within the netfilter subsystem where this flag was initially introduced. The byte-order flag is yet unused, it's intended use is to allow automatic byte order convertions for all atomic types. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_semantics.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index df17aab193b4..f823ca34cb12 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -487,7 +487,7 @@ static int rtm_to_fib_config(struct sk_buff *skb, struct nlmsghdr *nlh, } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { - switch (attr->nla_type) { + switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d30fb68d5f4e..1351a2617dce 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -743,7 +743,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int remaining; nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla->nla_type; + int type = nla_type(nla); if (type) { if (type > RTAX_MAX) -- cgit v1.2.3 From 39c90ece7565f5c47110c2fa77409d7a9478bd5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Sep 2007 10:55:54 -0700 Subject: [IPV4]: Convert rt_check_expire() from softirq processing to workqueue. On loaded/big hosts, rt_check_expire() if of litle use, because it generally breaks out of its main loop because of a jiffies change. It can take a long time (read : timer invocations) to actually scan the whole hash table, freeing unused entries. Converting it to use a workqueue instead of softirq is a nice move because we can allow rt_check_expire() to do the scan it is supposed to do, without hogging the CPU. This has an impact on the average number of entries in cache, reducing ram usage. Cache is more responsive to parameter changes (/proc/sys/net/ipv4/route/gc_timeout and /proc/sys/net/ipv4/route/gc_interval) Note: Maybe the default value of gc_interval (60 seconds) is too high, since this means we actually need 5 (300/60) invocations to scan the whole table. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 396c631166a4..006d6058a806 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -136,7 +137,8 @@ static unsigned long rt_deadline; #define RTprint(a...) printk(KERN_DEBUG a) static struct timer_list rt_flush_timer; -static struct timer_list rt_periodic_timer; +static void rt_check_expire(struct work_struct *work); +static DECLARE_DELAYED_WORK(expires_work, rt_check_expire); static struct timer_list rt_secret_timer; /* @@ -572,20 +574,19 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) (fl1->iif ^ fl2->iif)) == 0; } -/* This runs via a timer and thus is always in BH context. */ -static void rt_check_expire(unsigned long dummy) +static void rt_check_expire(struct work_struct *work) { static unsigned int rover; unsigned int i = rover, goal; struct rtable *rth, **rthp; - unsigned long now = jiffies; u64 mult; mult = ((u64)ip_rt_gc_interval) << rt_hash_log; if (ip_rt_gc_timeout > 1) do_div(mult, ip_rt_gc_timeout); goal = (unsigned int)mult; - if (goal > rt_hash_mask) goal = rt_hash_mask + 1; + if (goal > rt_hash_mask) + goal = rt_hash_mask + 1; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; @@ -594,11 +595,11 @@ static void rt_check_expire(unsigned long dummy) if (*rthp == 0) continue; - spin_lock(rt_hash_lock_addr(i)); + spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ - if (time_before_eq(now, rth->u.dst.expires)) { + if (time_before_eq(jiffies, rth->u.dst.expires)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; continue; @@ -613,14 +614,10 @@ static void rt_check_expire(unsigned long dummy) *rthp = rth->u.dst.rt_next; rt_free(rth); } - spin_unlock(rt_hash_lock_addr(i)); - - /* Fallback loop breaker. */ - if (time_after(jiffies, now)) - break; + spin_unlock_bh(rt_hash_lock_addr(i)); } rover = i; - mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); + schedule_delayed_work(&expires_work, ip_rt_gc_interval); } /* This can run from both BH and non-BH contexts, the latter @@ -2993,17 +2990,14 @@ int __init ip_rt_init(void) init_timer(&rt_flush_timer); rt_flush_timer.function = rt_run_flush; - init_timer(&rt_periodic_timer); - rt_periodic_timer.function = rt_check_expire; init_timer(&rt_secret_timer); rt_secret_timer.function = rt_secret_rebuild; /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ - rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + - ip_rt_gc_interval; - add_timer(&rt_periodic_timer); + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + ip_rt_secret_interval; -- cgit v1.2.3 From 10d024c1b2fd58af8362670d7d6e5ae52fc33353 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 17 Sep 2007 13:11:17 -0700 Subject: [NET]: Nuke SET_MODULE_OWNER macro. It's been a useless no-op for long enough in 2.6 so I figured it's time to remove it. The number of people that could object because they're maintaining unified 2.4 and 2.6 drivers is probably rather small. [ Handled drivers added by netdev tree and some missed IRDA cases... -DaveM ] Signed-off-by: Ralf Baechle Signed-off-by: Jeff Garzik Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 1 - net/ipv4/ipip.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3106225c5e50..ffa9f1c9dcbb 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1132,7 +1132,6 @@ static int ipgre_close(struct net_device *dev) static void ipgre_tunnel_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->uninit = ipgre_tunnel_uninit; dev->destructor = free_netdev; dev->hard_start_xmit = ipgre_tunnel_xmit; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 652bd86e33a3..5cd5bbe1379a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -237,7 +237,6 @@ static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int c return NULL; nt = netdev_priv(dev); - SET_MODULE_OWNER(dev); dev->init = ipip_tunnel_init; nt->parms = *parms; @@ -775,7 +774,6 @@ static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) static void ipip_tunnel_setup(struct net_device *dev) { - SET_MODULE_OWNER(dev); dev->uninit = ipip_tunnel_uninit; dev->hard_start_xmit = ipip_tunnel_xmit; dev->get_stats = ipip_tunnel_get_stats; -- cgit v1.2.3 From 63d804eade298208037045ab6728c933f2b6c27d Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sat, 15 Sep 2007 21:45:13 -0700 Subject: [CIPSO]: remove duplicated code in the cipso_v4_*_getattr() functions The bulk of the CIPSO option parsing/processing in the cipso_v4_sock_getattr() and cipso_v4_skb_getattr() functions are identical, the only real difference being where the functions obtain the CIPSO option itself. This patch creates a new function, cipso_v4_getattr(), which contains the common CIPSO option parsing/processing code and modifies the existing functions to call this new helper function. Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 115 ++++++++++++++++++-------------------------------- 1 file changed, 42 insertions(+), 73 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index ab56a052ce31..805a78e6ed55 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1831,67 +1831,74 @@ socket_setattr_failure: } /** - * cipso_v4_sock_getattr - Get the security attributes from a sock - * @sk: the sock + * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions + * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: - * Query @sk to see if there is a CIPSO option attached to the sock and if - * there is return the CIPSO security attributes in @secattr. This function - * requires that @sk be locked, or privately held, but it does not do any - * locking itself. Returns zero on success and negative values on failure. + * Inspect @cipso and return the security attributes in @secattr. Returns zero + * on success and negative values on failure. * */ -int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +static int cipso_v4_getattr(const unsigned char *cipso, + struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; - struct inet_sock *sk_inet; - unsigned char *cipso_ptr; u32 doi; struct cipso_v4_doi *doi_def; - sk_inet = inet_sk(sk); - if (sk_inet->opt == NULL || sk_inet->opt->cipso == 0) - return -ENOMSG; - cipso_ptr = sk_inet->opt->__data + sk_inet->opt->cipso - - sizeof(struct iphdr); - ret_val = cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr); - if (ret_val == 0) - return ret_val; + if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) + return 0; - doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2])); + doi = ntohl(get_unaligned((__be32 *)&cipso[2])); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) { - rcu_read_unlock(); - return -ENOMSG; - } - + if (doi_def == NULL) + goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ - switch (cipso_ptr[6]) { + switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: - ret_val = cipso_v4_parsetag_rbm(doi_def, - &cipso_ptr[6], - secattr); + ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: - ret_val = cipso_v4_parsetag_enum(doi_def, - &cipso_ptr[6], - secattr); + ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: - ret_val = cipso_v4_parsetag_rng(doi_def, - &cipso_ptr[6], - secattr); + ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; } - rcu_read_unlock(); +getattr_return: + rcu_read_unlock(); return ret_val; } +/** + * cipso_v4_sock_getattr - Get the security attributes from a sock + * @sk: the sock + * @secattr: the security attributes + * + * Description: + * Query @sk to see if there is a CIPSO option attached to the sock and if + * there is return the CIPSO security attributes in @secattr. This function + * requires that @sk be locked, or privately held, but it does not do any + * locking itself. Returns zero on success and negative values on failure. + * + */ +int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) +{ + struct ip_options *opt; + + opt = inet_sk(sk)->opt; + if (opt == NULL || opt->cipso == 0) + return -ENOMSG; + + return cipso_v4_getattr(opt->__data + opt->cipso - sizeof(struct iphdr), + secattr); +} + /** * cipso_v4_skbuff_getattr - Get the security attributes from the CIPSO option * @skb: the packet @@ -1905,45 +1912,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) int cipso_v4_skbuff_getattr(const struct sk_buff *skb, struct netlbl_lsm_secattr *secattr) { - int ret_val = -ENOMSG; - unsigned char *cipso_ptr; - u32 doi; - struct cipso_v4_doi *doi_def; - - cipso_ptr = CIPSO_V4_OPTPTR(skb); - if (cipso_v4_cache_check(cipso_ptr, cipso_ptr[1], secattr) == 0) - return 0; - - doi = ntohl(get_unaligned((__be32 *)&cipso_ptr[2])); - rcu_read_lock(); - doi_def = cipso_v4_doi_search(doi); - if (doi_def == NULL) - goto skbuff_getattr_return; - - /* XXX - This code assumes only one tag per CIPSO option which isn't - * really a good assumption to make but since we only support the MAC - * tags right now it is a safe assumption. */ - switch (cipso_ptr[6]) { - case CIPSO_V4_TAG_RBITMAP: - ret_val = cipso_v4_parsetag_rbm(doi_def, - &cipso_ptr[6], - secattr); - break; - case CIPSO_V4_TAG_ENUM: - ret_val = cipso_v4_parsetag_enum(doi_def, - &cipso_ptr[6], - secattr); - break; - case CIPSO_V4_TAG_RANGE: - ret_val = cipso_v4_parsetag_rng(doi_def, - &cipso_ptr[6], - secattr); - break; - } - -skbuff_getattr_return: - rcu_read_unlock(); - return ret_val; + return cipso_v4_getattr(CIPSO_V4_OPTPTR(skb), secattr); } /* -- cgit v1.2.3 From 76c72d4f44ec5fb7f88eda8a0d3aa30922c891d1 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Sun, 16 Sep 2007 15:44:27 -0700 Subject: [IPV4/IPV6/DECNET]: Small cleanup for fib rules. This patch slightly cleanups FIB rules framework. rules_list as a pointer on struct fib_rules_ops is useless. It is always assigned with a static per/subsystem list in IPv4, IPv6 and DecNet. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2a947840210e..f16839c6a721 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -76,8 +76,6 @@ static struct fib4_rule local_rule = { }, }; -static LIST_HEAD(fib4_rules); - #ifdef CONFIG_NET_CLS_ROUTE u32 fib_rules_tclass(struct fib_result *res) { @@ -279,9 +277,9 @@ static u32 fib4_rule_default_pref(void) struct list_head *pos; struct fib_rule *rule; - if (!list_empty(&fib4_rules)) { - pos = fib4_rules.next; - if (pos->next != &fib4_rules) { + if (!list_empty(&fib4_rules_ops.rules_list)) { + pos = fib4_rules_ops.rules_list.next; + if (pos->next != &fib4_rules_ops.rules_list) { rule = list_entry(pos->next, struct fib_rule, list); if (rule->pref) return rule->pref - 1; @@ -317,15 +315,15 @@ static struct fib_rules_ops fib4_rules_ops = { .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, .policy = fib4_rule_policy, - .rules_list = &fib4_rules, + .rules_list = LIST_HEAD_INIT(fib4_rules_ops.rules_list), .owner = THIS_MODULE, }; void __init fib4_rules_init(void) { - list_add_tail(&local_rule.common.list, &fib4_rules); - list_add_tail(&main_rule.common.list, &fib4_rules); - list_add_tail(&default_rule.common.list, &fib4_rules); + list_add_tail(&local_rule.common.list, &fib4_rules_ops.rules_list); + list_add_tail(&main_rule.common.list, &fib4_rules_ops.rules_list); + list_add_tail(&default_rule.common.list, &fib4_rules_ops.rules_list); fib_rules_register(&fib4_rules_ops); } -- cgit v1.2.3 From 0cfad07555312468296ea3bbbcdf99038f58678b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 16 Sep 2007 16:24:44 -0700 Subject: [NETLINK]: Avoid pointer in netlink_run_queue I was looking at Patrick's fix to inet_diag and it occured to me that we're using a pointer argument to return values unnecessarily in netlink_run_queue. Changing it to return the value will allow the compiler to generate better code since the value won't have to be memory-backed. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 031cc4856b49..b04a6ee5a9a1 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -845,7 +845,7 @@ static void inet_diag_rcv(struct sock *sk, int len) do { mutex_lock(&inet_diag_mutex); - netlink_run_queue(sk, &qlen, &inet_diag_rcv_msg); + qlen = netlink_run_queue(sk, qlen, &inet_diag_rcv_msg); mutex_unlock(&inet_diag_mutex); } while (qlen); } -- cgit v1.2.3 From c40f6fff401f693f627d3d44ef7663b993943517 Mon Sep 17 00:00:00 2001 From: Denis Cheng Date: Sun, 16 Sep 2007 16:39:25 -0700 Subject: [IPV4] af_inet.c: use ARRAY_SIZE macro from kernel.h instead Signed-off-by: Denis Cheng Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 110a19edacc8..e594a2c89661 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -942,7 +942,7 @@ static struct inet_protosw inetsw_array[] = } }; -#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw)) +#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) void inet_register_protosw(struct inet_protosw *p) { -- cgit v1.2.3 From 96793b482540f3a26e2188eaf75cb56b7829d3e3 Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Mon, 17 Sep 2007 09:57:33 -0700 Subject: [IPV4]: Add ICMPMsgStats MIB (RFC 4293) Background: RFC 4293 deprecates existing individual, named ICMP type counters to be replaced with the ICMPMsgStatsTable. This table includes entries for both IPv4 and IPv6, and requires counting of all ICMP types, whether or not the machine implements the type. These patches "remove" (but not really) the existing counters, and replace them with the ICMPMsgStats tables for v4 and v6. It includes the named counters in the /proc places they were, but gets the values for them from the new tables. It also counts packets generated from raw socket output (e.g., OutEchoes, MLD queries, RA's from radvd, etc). Changes: 1) create icmpmsg_statistics mib 2) create icmpv6msg_statistics mib 3) modify existing counters to use these 4) modify /proc/net/snmp to add "IcmpMsg" with all ICMP types listed by number for easy SNMP parsing 5) modify /proc/net/snmp printing for "Icmp" to get the named data from new counters. Signed-off-by: David L Stevens Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 6 +++ net/ipv4/icmp.c | 53 +++-------------------- net/ipv4/ip_output.c | 4 ++ net/ipv4/proc.c | 120 ++++++++++++++++++++++++++++++++++++++------------- net/ipv4/raw.c | 3 ++ 5 files changed, 108 insertions(+), 78 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e594a2c89661..621b128897d7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1302,6 +1302,10 @@ static int __init init_ipv4_mibs(void) sizeof(struct icmp_mib), __alignof__(struct icmp_mib)) < 0) goto err_icmp_mib; + if (snmp_mib_init((void **)icmpmsg_statistics, + sizeof(struct icmpmsg_mib), + __alignof__(struct icmpmsg_mib)) < 0) + goto err_icmpmsg_mib; if (snmp_mib_init((void **)tcp_statistics, sizeof(struct tcp_mib), __alignof__(struct tcp_mib)) < 0) @@ -1324,6 +1328,8 @@ err_udplite_mib: err_udp_mib: snmp_mib_free((void **)tcp_statistics); err_tcp_mib: + snmp_mib_free((void **)icmpmsg_statistics); +err_icmpmsg_mib: snmp_mib_free((void **)icmp_statistics); err_icmp_mib: snmp_mib_free((void **)ip_statistics); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 68a22670f597..272c69e106e9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -115,6 +115,7 @@ struct icmp_bxm { * Statistics */ DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; +DEFINE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics) __read_mostly; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ @@ -214,8 +215,6 @@ int sysctl_icmp_errors_use_inbound_ifaddr __read_mostly; */ struct icmp_control { - int output_entry; /* Field for increment on output */ - int input_entry; /* Field for increment on input */ void (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; @@ -316,12 +315,10 @@ out: /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ -static void icmp_out_count(int type) +void icmp_out_count(unsigned char type) { - if (type <= NR_ICMP_TYPES) { - ICMP_INC_STATS(icmp_pointers[type].output_entry); - ICMP_INC_STATS(ICMP_MIB_OUTMSGS); - } + ICMPMSGOUT_INC_STATS(type); + ICMP_INC_STATS(ICMP_MIB_OUTMSGS); } /* @@ -390,7 +387,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) return; icmp_param->data.icmph.checksum = 0; - icmp_out_count(icmp_param->data.icmph.type); inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; @@ -952,6 +948,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); + ICMPMSGIN_INC_STATS_BH(icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -986,7 +983,6 @@ int icmp_rcv(struct sk_buff *skb) } } - ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry); icmp_pointers[icmph->type].handler(skb); drop: @@ -1002,109 +998,71 @@ error: */ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { - .output_entry = ICMP_MIB_OUTECHOREPS, - .input_entry = ICMP_MIB_INECHOREPS, .handler = icmp_discard, }, [1] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [2] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [ICMP_DEST_UNREACH] = { - .output_entry = ICMP_MIB_OUTDESTUNREACHS, - .input_entry = ICMP_MIB_INDESTUNREACHS, .handler = icmp_unreach, .error = 1, }, [ICMP_SOURCE_QUENCH] = { - .output_entry = ICMP_MIB_OUTSRCQUENCHS, - .input_entry = ICMP_MIB_INSRCQUENCHS, .handler = icmp_unreach, .error = 1, }, [ICMP_REDIRECT] = { - .output_entry = ICMP_MIB_OUTREDIRECTS, - .input_entry = ICMP_MIB_INREDIRECTS, .handler = icmp_redirect, .error = 1, }, [6] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [7] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [ICMP_ECHO] = { - .output_entry = ICMP_MIB_OUTECHOS, - .input_entry = ICMP_MIB_INECHOS, .handler = icmp_echo, }, [9] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [10] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_INERRORS, .handler = icmp_discard, .error = 1, }, [ICMP_TIME_EXCEEDED] = { - .output_entry = ICMP_MIB_OUTTIMEEXCDS, - .input_entry = ICMP_MIB_INTIMEEXCDS, .handler = icmp_unreach, .error = 1, }, [ICMP_PARAMETERPROB] = { - .output_entry = ICMP_MIB_OUTPARMPROBS, - .input_entry = ICMP_MIB_INPARMPROBS, .handler = icmp_unreach, .error = 1, }, [ICMP_TIMESTAMP] = { - .output_entry = ICMP_MIB_OUTTIMESTAMPS, - .input_entry = ICMP_MIB_INTIMESTAMPS, .handler = icmp_timestamp, }, [ICMP_TIMESTAMPREPLY] = { - .output_entry = ICMP_MIB_OUTTIMESTAMPREPS, - .input_entry = ICMP_MIB_INTIMESTAMPREPS, .handler = icmp_discard, }, [ICMP_INFO_REQUEST] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_DUMMY, .handler = icmp_discard, }, [ICMP_INFO_REPLY] = { - .output_entry = ICMP_MIB_DUMMY, - .input_entry = ICMP_MIB_DUMMY, .handler = icmp_discard, }, [ICMP_ADDRESS] = { - .output_entry = ICMP_MIB_OUTADDRMASKS, - .input_entry = ICMP_MIB_INADDRMASKS, .handler = icmp_address, }, [ICMP_ADDRESSREPLY] = { - .output_entry = ICMP_MIB_OUTADDRMASKREPS, - .input_entry = ICMP_MIB_INADDRMASKREPS, .handler = icmp_address_reply, }, }; @@ -1146,4 +1104,5 @@ void __init icmp_init(struct net_proto_family *ops) EXPORT_SYMBOL(icmp_err_convert); EXPORT_SYMBOL(icmp_send); EXPORT_SYMBOL(icmp_statistics); +EXPORT_SYMBOL(icmpmsg_statistics); EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0f1d7beacf78..77f67b7cb9bf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1261,6 +1261,10 @@ int ip_push_pending_frames(struct sock *sk) skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(((struct icmphdr *) + skb_transport_header(skb))->type); + /* Netfilter gets whole the not fragmented skb. */ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 95a8f8f2de71..2015148b41a8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -124,33 +124,30 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { static const struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), - SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), - SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS), - SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS), - SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS), - SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS), - SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS), - SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS), - SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS), - SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS), - SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS), - SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS), SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS), SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS), - SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS), - SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS), - SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS), - SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS), - SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS), - SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS), - SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS), - SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS), - SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS), - SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS), - SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS), SNMP_MIB_SENTINEL }; +static struct { + char *name; + int index; +} icmpmibmap[] = { + { "DestUnreachs", ICMP_DEST_UNREACH }, + { "TimeExcds", ICMP_TIME_EXCEEDED }, + { "ParmProbs", ICMP_PARAMETERPROB }, + { "SrcQuenchs", ICMP_SOURCE_QUENCH }, + { "Redirects", ICMP_REDIRECT }, + { "Echos", ICMP_ECHO }, + { "EchoReps", ICMP_ECHOREPLY }, + { "Timestamps", ICMP_TIMESTAMP }, + { "TimestampReps", ICMP_TIMESTAMPREPLY }, + { "AddrMasks", ICMP_ADDRESS }, + { "AddrMaskReps", ICMP_ADDRESSREPLY }, + { 0, 0 } +}; + + static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), @@ -251,6 +248,72 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_SENTINEL }; +static void icmpmsg_put(struct seq_file *seq) +{ +#define PERLINE 16 + + int j, i, count; + static int out[PERLINE]; + + count = 0; + for (i = 0; i < ICMPMSG_MIB_MAX; i++) { + + if (snmp_fold_field((void **) icmpmsg_statistics, i)) + out[count++] = i; + if (count < PERLINE) + continue; + + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < PERLINE; ++j) + seq_printf(seq, " %sType%u", i & 0x100 ? "Out" : "In", + i & 0xff); + seq_printf(seq, "\nIcmpMsg: "); + for (j = 0; j < PERLINE; ++j) + seq_printf(seq, " %lu", + snmp_fold_field((void **) icmpmsg_statistics, + out[j])); + seq_putc(seq, '\n'); + } + if (count) { + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %sType%u", out[j] & 0x100 ? "Out" : + "In", out[j] & 0xff); + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %lu", snmp_fold_field((void **) + icmpmsg_statistics, out[j])); + } + +#undef PERLINE +} + +static void icmp_put(struct seq_file *seq) +{ + int i; + + seq_puts(seq, "\nIcmp: InMsgs InErrors"); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " In%s", icmpmibmap[i].name); + seq_printf(seq, " OutMsgs OutErrors"); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " Out%s", icmpmibmap[i].name); + seq_printf(seq, "\nIcmp: %lu %lu", + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void **) icmpmsg_statistics, + icmpmibmap[i].index)); + seq_printf(seq, " %lu %lu", + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", + snmp_fold_field((void **) icmpmsg_statistics, + icmpmibmap[i].index)); +} + /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ @@ -271,15 +334,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) snmp_fold_field((void **)ip_statistics, snmp4_ipstats_list[i].entry)); - seq_puts(seq, "\nIcmp:"); - for (i = 0; snmp4_icmp_list[i].name != NULL; i++) - seq_printf(seq, " %s", snmp4_icmp_list[i].name); - - seq_puts(seq, "\nIcmp:"); - for (i = 0; snmp4_icmp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field((void **)icmp_statistics, - snmp4_icmp_list[i].entry)); + icmp_put(seq); /* RFC 2011 compatibility */ + icmpmsg_put(seq); seq_puts(seq, "\nTcp:"); for (i = 0; snmp4_tcp_list[i].name != NULL; i++) @@ -336,6 +392,8 @@ static const struct file_operations snmp_seq_fops = { .release = single_release, }; + + /* * Output /proc/net/netstat */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 216e01b0f44a..07070c7067f3 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -314,6 +314,9 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } + if (iph->protocol == IPPROTO_ICMP) + icmp_out_count(((struct icmphdr *) + skb_transport_header(skb))->type); err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); -- cgit v1.2.3 From 5ee3afba88f5a79d0bff07ddd87af45919259f91 Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Tue, 18 Sep 2007 13:26:31 -0700 Subject: [TCP]: Return useful listenq info in tcp_info and INET_DIAG_INFO. Return some useful information such as the maximum listen backlog and the current listen backlog in the tcp_info structure and INET_DIAG_INFO. Signed-off-by: Rick Jones Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 +++++++-- net/ipv4/tcp_diag.c | 8 +++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 18c64c74f6d5..4f322003835d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2031,8 +2031,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; + if (sk->sk_state == TCP_LISTEN) { + info->tcpi_unacked = sk->sk_ack_backlog; + info->tcpi_sacked = sk->sk_max_ack_backlog; + } else { + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + } info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 57c5f0b10e6c..3904d2158a92 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -25,11 +25,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; - if (sk->sk_state == TCP_LISTEN) + if (sk->sk_state == TCP_LISTEN) { r->idiag_rqueue = sk->sk_ack_backlog; - else + r->idiag_wqueue = sk->sk_max_ack_backlog; + } else { r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->idiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + } if (info != NULL) tcp_get_info(sk, info); } -- cgit v1.2.3 From 0dde7b5404a3d52dcd9ce66d46197f6c3ca97dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 20 Sep 2007 11:28:05 -0700 Subject: [TCP]: Maintain highest_sack accurately to the highest skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In general, it should not be necessary to call tcp_fragment for already SACKed skbs, but it's better to be safe than sorry. And indeed, it can be called from sacktag when a DSACK arrives or some ACK (with SACK) reordering occurs (sacktag could be made to avoid the call in the latter case though I'm not sure if it's worth of the trouble and added complexity to cover such marginal case). The collapse case has return for SACKED_ACKED case earlier, so just WARN_ON if internal inconsistency is detected for some reason. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d65d17bb2a09..9df5b2a4da6e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -692,6 +692,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + if (tp->sacked_out && (TCP_SKB_CB(skb)->seq == tp->highest_sack)) + tp->highest_sack = TCP_SKB_CB(buff)->seq; + /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); @@ -1723,6 +1726,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + if (WARN_ON(tp->sacked_out && + (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) + return; + /* Merge over control information. */ flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->flags = flags; -- cgit v1.2.3 From 91fed7a15c9222af29a653ecb0ee72cff178fdd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 9 Oct 2007 01:24:15 -0700 Subject: [TCP]: Make fackets_out accurate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Substraction for fackets_out is unconditional when snd_una advances, thus there's no need to do it inside the loop. Just make sure correct bounds are honored. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 +++------- net/ipv4/tcp_output.c | 44 ++++++++++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f9e4d7ad68b7..46aedd6ca020 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2299,8 +2299,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) * 1. Reno does not count dupacks (sacked_out) automatically. */ if (!tp->packets_out) tp->sacked_out = 0; - /* 2. SACK counts snd_fack in packets inaccurately. */ - if (tp->sacked_out == 0) + + if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. @@ -2568,10 +2568,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, } else if (*seq_rtt < 0) *seq_rtt = now - scb->when; - if (tp->fackets_out) { - __u32 dval = min(tp->fackets_out, packets_acked); - tp->fackets_out -= dval; - } /* hint's skb might be NULL but we don't need to care */ tp->fastpath_cnt_hint -= min_t(u32, packets_acked, tp->fastpath_cnt_hint); @@ -2657,7 +2653,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = now - scb->when; last_ackt = skb->tstamp; } - tcp_dec_pcount_approx(&tp->fackets_out, skb); tp->packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); @@ -2672,6 +2667,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_rearm_rto(sk); + tp->fackets_out -= min(pkts_acked, tp->fackets_out); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9df5b2a4da6e..cbe8bf6dab5f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -652,6 +652,26 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned } } +/* When a modification to fackets out becomes necessary, we need to check + * skb is counted to fackets_out or not. Another important thing is to + * tweak SACK fastpath hint too as it would overwrite all changes unless + * hint is also changed. + */ +static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, + int decr) +{ + if (!tp->sacked_out) + return; + + if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq)) + tp->fackets_out -= decr; + + /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */ + if (tp->fastpath_skb_hint != NULL && + after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq)) + tp->fastpath_cnt_hint -= decr; +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -746,21 +766,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= diff; - if (diff > 0) { - /* Adjust Reno SACK estimate. */ - if (tcp_is_reno(tp)) { - tcp_dec_pcount_approx_int(&tp->sacked_out, diff); - tcp_verify_left_out(tp); - } - - tcp_dec_pcount_approx_int(&tp->fackets_out, diff); - /* SACK fastpath might overwrite it unless dealt with */ - if (tp->fastpath_skb_hint != NULL && - after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, - TCP_SKB_CB(skb)->seq)) { - tcp_dec_pcount_approx_int(&tp->fastpath_cnt_hint, diff); - } + /* Adjust Reno SACK estimate. */ + if (tcp_is_reno(tp) && diff > 0) { + tcp_dec_pcount_approx_int(&tp->sacked_out, diff); + tcp_verify_left_out(tp); } + tcp_adjust_fackets_out(tp, skb, diff); } /* Link BUFF into the send queue. */ @@ -1746,10 +1757,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m if (tcp_is_reno(tp) && tp->sacked_out) tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - /* Not quite right: it can be > snd.fack, but - * it is better to underestimate fackets. - */ - tcp_dec_pcount_approx(&tp->fackets_out, next_skb); + tcp_adjust_fackets_out(tp, skb, tcp_skb_pcount(next_skb)); tp->packets_out -= tcp_skb_pcount(next_skb); sk_stream_free_skb(sk, next_skb); } -- cgit v1.2.3 From 5af4ec236f7c98f3671fb26731457a172d85e0e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 20 Sep 2007 11:30:48 -0700 Subject: [TCP]: clear_all_retrans_hints prefixed by tcp_ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In addition, fix its function comment spacing. Signed-off-by: Ilpo Järvinen --- net/ipv4/tcp_input.c | 10 +++++----- net/ipv4/tcp_output.c | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 46aedd6ca020..31e7e339b567 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1667,7 +1667,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1738,7 +1738,7 @@ void tcp_enter_loss(struct sock *sk, int how) /* Abort FRTO algorithm if one is in progress */ tp->frto_counter = 0; - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -2103,7 +2103,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) /* There is something screwy going on with the retrans hints after an undo */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -2196,7 +2196,7 @@ static int tcp_try_undo_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); DBGUNDO(sk, "partial loss"); tp->lost_out = 0; @@ -2656,7 +2656,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tp->packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cbe8bf6dab5f..f46d24b8410f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -687,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len > skb->len); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1719,7 +1719,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_skb_pcount(next_skb) != 1); /* changing transmit queue under us so clear hints */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1792,7 +1792,7 @@ void tcp_simple_retransmit(struct sock *sk) } } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); if (!lost) return; -- cgit v1.2.3 From 13fcf850cc20373db4dd8a5c9f349583ab3817c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 9 Oct 2007 01:28:45 -0700 Subject: [TCP]: Move accounting from tso_acked to clean_rtx_queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The accounting code is pretty much the same, so it's a shame we do it in two places. I'm not too sure if added fully_acked check in MTU probing is really what we want perhaps the added end_seq could be used in the after() comparison. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 81 +++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 46 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 31e7e339b567..4238ed98acb9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2525,14 +2525,12 @@ static void tcp_rearm_rto(struct sock *sk) } } -static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, - __u32 now, __s32 *seq_rtt) +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u32 seq = tp->snd_una; __u32 packets_acked; - int acked = 0; /* If we get here, the whole TSO packet has not been * acked. @@ -2545,39 +2543,11 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { - __u8 sacked = scb->sacked; - - acked |= FLAG_DATA_ACKED; - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; - *seq_rtt = -1; - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(seq, tp->snd_up)) - tp->urg_mode = 0; - } - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - - /* hint's skb might be NULL but we don't need to care */ - tp->fastpath_cnt_hint -= min_t(u32, packets_acked, - tp->fastpath_cnt_hint); - tp->packets_out -= packets_acked; - BUG_ON(tcp_skb_pcount(skb) == 0); BUG_ON(!before(scb->seq, scb->end_seq)); } - return acked; + return packets_acked; } /* Remove acknowledged frames from the retransmission queue. */ @@ -2587,6 +2557,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; + int fully_acked = 1; int acked = 0; int prior_packets = tp->packets_out; __s32 seq_rtt = -1; @@ -2595,6 +2566,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u32 end_seq; + u32 packets_acked; __u8 sacked = scb->sacked; /* If our packet is before the ack sequence we can @@ -2602,11 +2575,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * the other end. */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1 && - after(tp->snd_una, scb->seq)) - acked |= tcp_tso_acked(sk, skb, - now, &seq_rtt); - break; + if (tcp_skb_pcount(skb) == 1 || + !after(tp->snd_una, scb->seq)) + break; + + packets_acked = tcp_tso_acked(sk, skb); + if (!packets_acked) + break; + + fully_acked = 0; + end_seq = tp->snd_una; + } else { + packets_acked = tcp_skb_pcount(skb); + end_seq = scb->end_seq; } /* Initial outgoing SYN's get put onto the write_queue @@ -2624,7 +2605,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } /* MTU probing checks */ - if (icsk->icsk_mtup.probe_size) { + if (fully_acked && icsk->icsk_mtup.probe_size) { if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { tcp_mtup_probe_success(sk, skb); } @@ -2633,27 +2614,32 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) if (sacked) { if (sacked & TCPCB_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(skb); + tp->retrans_out -= packets_acked; acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - last_ackt = skb->tstamp; + if (fully_acked) + last_ackt = skb->tstamp; } if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= tcp_skb_pcount(skb); + tp->sacked_out -= packets_acked; if (sacked & TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(skb); + tp->lost_out -= packets_acked; if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(scb->end_seq, tp->snd_up)) + if (tp->urg_mode && !before(end_seq, tp->snd_up)) tp->urg_mode = 0; } } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - last_ackt = skb->tstamp; + if (fully_acked) + last_ackt = skb->tstamp; } - tp->packets_out -= tcp_skb_pcount(skb); + tp->packets_out -= packets_acked; + + if (!fully_acked) + break; + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); tcp_clear_all_retrans_hints(tp); @@ -2668,6 +2654,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_rearm_rto(sk); tp->fackets_out -= min(pkts_acked, tp->fackets_out); + /* hint's skb might be NULL but we don't need to care */ + tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, + tp->fastpath_cnt_hint); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); -- cgit v1.2.3 From 7c46a03e67d11d917d6c3dbf501b465b2ca97a6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 20 Sep 2007 11:33:43 -0700 Subject: [TCP]: Cleanup tcp_tso_acked and tcp_clean_rtx_queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements following cleanups: - Comment re-placement (CodingStyle) - tcp_tso_acked() local (wrapper-like) variable removal (readability) - __-types removed (IMHO they make local variables jumpy looking and just was space) - acked -> flag (naming conventions elsewhere in TCP code) - linebreak adjustments (readability) - nested if()s combined (reduced indentation) - clarifying newlines added Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 66 ++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4238ed98acb9..8fe754be8076 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2525,55 +2525,49 @@ static void tcp_rearm_rto(struct sock *sk) } } +/* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u32 seq = tp->snd_una; - __u32 packets_acked; + u32 packets_acked; - /* If we get here, the whole TSO packet has not been - * acked. - */ - BUG_ON(!after(scb->end_seq, seq)); + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); - if (tcp_trim_head(sk, skb, seq - scb->seq)) + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { BUG_ON(tcp_skb_pcount(skb) == 0); - BUG_ON(!before(scb->seq, scb->end_seq)); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } return packets_acked; } -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +/* Remove acknowledged frames from the retransmission queue. If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. + */ +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - __u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp; int fully_acked = 1; - int acked = 0; + int flag = 0; int prior_packets = tp->packets_out; - __s32 seq_rtt = -1; + s32 seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); - while ((skb = tcp_write_queue_head(sk)) && - skb != tcp_send_head(sk)) { + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; u32 packets_acked; - __u8 sacked = scb->sacked; + u8 sacked = scb->sacked; - /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived at - * the other end. - */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) @@ -2598,38 +2592,38 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * quickly. This is severely frowned upon behavior. */ if (!(scb->flags & TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; + flag |= FLAG_DATA_ACKED; } else { - acked |= FLAG_SYN_ACKED; + flag |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; } /* MTU probing checks */ - if (fully_acked && icsk->icsk_mtup.probe_size) { - if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { - tcp_mtup_probe_success(sk, skb); - } + if (fully_acked && icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { + tcp_mtup_probe_success(sk, skb); } if (sacked) { if (sacked & TCPCB_RETRANS) { if (sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; + flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; if (fully_acked) last_ackt = skb->tstamp; } + if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= packets_acked; if (sacked & TCPCB_LOST) tp->lost_out -= packets_acked; - if (sacked & TCPCB_URG) { - if (tp->urg_mode && !before(end_seq, tp->snd_up)) - tp->urg_mode = 0; - } + + if ((sacked & TCPCB_URG) && tp->urg_mode && + !before(end_seq, tp->snd_up)) + tp->urg_mode = 0; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; if (fully_acked) @@ -2645,12 +2639,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_clear_all_retrans_hints(tp); } - if (acked&FLAG_ACKED) { + if (flag & FLAG_ACKED) { u32 pkts_acked = prior_packets - tp->packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - tcp_ack_update_rtt(sk, acked, seq_rtt); + tcp_ack_update_rtt(sk, flag, seq_rtt); tcp_rearm_rto(sk); tp->fackets_out -= min(pkts_acked, tp->fackets_out); @@ -2664,7 +2658,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) s32 rtt_us = -1; /* Is the ACK triggering packet unambiguous? */ - if (!(acked & FLAG_RETRANS_DATA_ACKED)) { + if (!(flag & FLAG_RETRANS_DATA_ACKED)) { /* High resolution needed and available? */ if (ca_ops->flags & TCP_CONG_RTT_STAMP && !ktime_equal(last_ackt, @@ -2703,7 +2697,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } #endif *seq_rtt_p = seq_rtt; - return acked; + return flag; } static void tcp_ack_probe(struct sock *sk) -- cgit v1.2.3 From 009a2e3e4ec395a290b9e4c7c9ff99296fd6b7d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 20 Sep 2007 11:34:38 -0700 Subject: [TCP] FRTO: Improve interoperability with other undo_marker users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Basically this change enables it, previously other undo_marker users were left with nothing. Reverse undo_marker logic completely to get it set right in CA_Loss. On the other hand, when spurious RTO is detected, clear it. Clearing might be too heavy for some scenarios but seems safe enough starting point for now and shouldn't have much effect except in majority of cases (if in any). By adding a new FLAG_ we avoid looping through write_queue when RTO occurs. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8fe754be8076..0feb10935be1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -104,6 +104,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -1594,6 +1595,8 @@ void tcp_enter_frto(struct sock *sk) tp->undo_retrans = 0; skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1643,6 +1646,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } @@ -1658,7 +1663,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->undo_marker = 0; tp->frto_counter = 0; tp->reordering = min_t(unsigned int, tp->reordering, @@ -2584,20 +2588,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) end_seq = scb->end_seq; } - /* Initial outgoing SYN's get put onto the write_queue - * just like anything else we transmit. It is not - * true data, and if we misinform our callers that - * this ACK acks real data, we will erroneously exit - * connection startup slow start one packet too - * quickly. This is severely frowned upon behavior. - */ - if (!(scb->flags & TCPCB_FLAG_SYN)) { - flag |= FLAG_DATA_ACKED; - } else { - flag |= FLAG_SYN_ACKED; - tp->retrans_stamp = 0; - } - /* MTU probing checks */ if (fully_acked && icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { @@ -2610,6 +2600,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) tp->retrans_out -= packets_acked; flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || + (packets_acked > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; if (fully_acked) @@ -2631,6 +2624,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) } tp->packets_out -= packets_acked; + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if (!(scb->flags & TCPCB_FLAG_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + if (!fully_acked) break; @@ -2852,6 +2859,10 @@ static int tcp_process_frto(struct sock *sk, int flag) if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || + ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) + tp->undo_marker = 0; + if (!before(tp->snd_una, tp->frto_highmark)) { tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; @@ -2916,6 +2927,7 @@ static int tcp_process_frto(struct sock *sk, int flag) break; } tp->frto_counter = 0; + tp->undo_marker = 0; } return 0; } -- cgit v1.2.3 From c96fd3d461fa495400df24be3b3b66f0e0b152f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 20 Sep 2007 11:36:37 -0700 Subject: [TCP]: Enable SACK enhanced FRTO (RFC4138) by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most of the description that follows comes from my mail to netdev (some editing done): Main obstacle to FRTO use is its deployment as it has to be on the sender side where as wireless link is often the receiver's access link. Take initiative on behalf of unlucky receivers and enable it by default in future Linux TCP senders. Also IETF seems to interested in advancing FRTO from experimental [1]. How does FRTO help? =================== FRTO detects spurious RTOs and avoids a number of unnecessary retransmissions and a couple of other problems that can arise due to incorrect guess made at RTO (i.e., that segments were lost when they actually got delayed which is likely to occur e.g. in wireless environments with link-layer retransmission). Though FRTO cannot prevent the first (potentially unnecessary) retransmission at RTO, I suspect that it won't cost that much even if you have to pay for each bit (won't be that high percentage out of all packets after all :-)). However, usually when you have a spurious RTO, not only the first segment unnecessarily retransmitted but the *whole window*. It goes like this: all cumulative ACKs got delayed due to in-order delivery, then TCP will actually send 1.5*original cwnd worth of data in the RTO's slow-start when the delayed ACKs arrive (basically the original cwnd worth of it unnecessarily). In case one is interested in minimizing unnecessary retransmissions e.g. due to cost, those rexmissions must never see daylight. Besides, in the worst case the generated burst overloads the bottleneck buffers which is likely to significantly delay the further progress of the flow. In case of ll rexmissions, ACK compression often occurs at the same time making the burst very "sharp edged" (in that case TCP often loses most of the segments above high_seq => very bad performance too). When FRTO is enabled, those unnecessary retransmissions are fully avoided except for the first segment and the cwnd behavior after detected spurious RTO is determined by the response (one can tune that by sysctl). Basic version (non-SACK enhanced one), FRTO can fail to detect spurious RTO as spurious and falls back to conservative behavior. ACK lossage is much less significant than reordering, usually the FRTO can detect spurious RTO if at least 2 cumulative ACKs from original window are preserved (excluding the ACK that advances to high_seq). With SACK-enhanced version, the detection is quite robust. FRTO should remove the need to set a high lower bound for the RTO estimator due to delay spikes that occur relatively common in some environments (esp. in wireless/cellular ones). [1] http://www1.ietf.org/mail-archive/web/tcpm/current/msg02862.html Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0feb10935be1..65b9f274a774 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; -- cgit v1.2.3 From b76892051cf1c04d95872838e70146f65e3b9d75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 20 Sep 2007 11:37:19 -0700 Subject: [TCP]: Avoid clearing sacktag hint in trivial situations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no reason to clear the sacktag skb hint when small part of the rexmit queue changes. Account changes (if any) instead when fragmenting/collapsing. RTO/FRTO do not touch SACKED_ACKED bits so no need to discard SACK tag hint at all. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 ++++++++------ net/ipv4/tcp_output.c | 12 ++++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 65b9f274a774..4c10d9cad20f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1671,7 +1671,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1711,10 +1711,14 @@ void tcp_enter_loss(struct sock *sk, int how) tp->bytes_acked = 0; tcp_clear_retrans(tp); - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - if (!how) + if (!how) { + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ tp->undo_marker = tp->snd_una; + tcp_clear_retrans_hints_partial(tp); + } else { + tcp_clear_all_retrans_hints(tp); + } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1741,8 +1745,6 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); /* Abort FRTO algorithm if one is in progress */ tp->frto_counter = 0; - - tcp_clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f46d24b8410f..cbb83acd830a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -687,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len > skb->len); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1718,9 +1718,6 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* changing transmit queue under us so clear hints */ - tcp_clear_all_retrans_hints(tp); - /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1759,6 +1756,13 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_adjust_fackets_out(tp, skb, tcp_skb_pcount(next_skb)); tp->packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + /* manually tune sacktag skb hint */ + if (tp->fastpath_skb_hint == next_skb) + tp->fastpath_skb_hint = skb; + sk_stream_free_skb(sk, next_skb); } } -- cgit v1.2.3 From de3cb747ffac5f2a4a6bb156e7e2fd5229e688e5 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 25 Sep 2007 19:16:28 -0700 Subject: [NET]: Dynamically allocate the loopback device, part 1. This patch replaces all occurences to the static variable loopback_dev to a pointer loopback_dev. That provides the mindless, trivial, uninteressting change part for the dynamic allocation for the loopback. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Acked-By: Kirill Korotaev Acked-by: Benjamin Thery Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 6 +++--- net/ipv4/ipconfig.c | 6 +++--- net/ipv4/ipvs/ip_vs_core.c | 2 +- net/ipv4/route.c | 18 +++++++++--------- net/ipv4/xfrm4_policy.c | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 721b89b60963..affea9b121fc 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -203,7 +203,7 @@ static void inetdev_destroy(struct in_device *in_dev) ASSERT_RTNL(); dev = in_dev->dev; - if (dev == &loopback_dev) + if (dev == loopback_dev) return; in_dev->dead = 1; @@ -1061,7 +1061,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, in_dev = inetdev_init(dev); if (!in_dev) return notifier_from_errno(-ENOMEM); - if (dev == &loopback_dev) { + if (dev == loopback_dev) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } @@ -1077,7 +1077,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: if (dev->mtu < 68) break; - if (dev == &loopback_dev) { + if (dev == loopback_dev) { struct in_ifaddr *ifa; if ((ifa = inet_alloc_ifa()) != NULL) { ifa->ifa_local = diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 4303851749f6..2d2e0cda0470 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -190,11 +190,11 @@ static int __init ic_open_devs(void) rtnl_lock(); /* bring loopback device up first */ - if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0) - printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name); + if (dev_change_flags(loopback_dev, loopback_dev->flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev->name); for_each_netdev(&init_net, dev) { - if (dev == &loopback_dev) + if (dev == loopback_dev) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index f005a2f929f4..74503267e5d4 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -961,7 +961,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, * ... don't know why 1st test DOES NOT include 2nd (?) */ if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev == &loopback_dev || skb->sk)) { + || skb->dev == loopback_dev || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", skb->pkt_type, ip_hdr(skb)->protocol, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 006d6058a806..ca2878dc6188 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1402,8 +1402,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != &loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(&loopback_dev); + if (dev != loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1555,7 +1555,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; + rth->u.dst.dev = loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1812,7 +1812,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - loopback_dev.ifindex, + loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1879,7 +1879,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = &loopback_dev; + rth->u.dst.dev = loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -2149,7 +2149,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = loopback_dev.ifindex, + .iif = loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2243,9 +2243,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = loopback_dev; dev_hold(dev_out); - fl.oif = loopback_dev.ifindex; + fl.oif = loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2290,7 +2290,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4ff8ed30024f..29ab3de8c47f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -306,7 +306,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(&loopback_dev); + struct in_device *loopback_idev = in_dev_get(loopback_dev); BUG_ON(!loopback_idev); do { -- cgit v1.2.3 From a6963a6b3d2d3921b60f45e0cf18be26495d5ad1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 25 Sep 2007 22:44:14 -0700 Subject: [TCP]: Re-place highest_sack check to a more robust position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I previously added checking to position that is rather poor as state has already been adjusted quite a bit. Re-placing it above all state changes should be more robust though the return should never ever get executed regardless of its place :-). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cbb83acd830a..94c80113df7e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1718,6 +1718,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); + if (WARN_ON(tp->sacked_out && + (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) + return; + /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1734,10 +1738,6 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - if (WARN_ON(tp->sacked_out && - (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) - return; - /* Merge over control information. */ flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->flags = flags; -- cgit v1.2.3 From 93e680202929802558f783fbd96c41697ae65472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 25 Sep 2007 22:46:50 -0700 Subject: [TCP]: Reordered ACK's (old) SACKs not included to discarded MIB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of ACK reordering, the SACK block might be valid in it's time but is already obsoleted since we've received another kind of confirmation about arrival of the segments through snd_una advancement of an earlier packet. I didn't bother to build distinguishing of valid and invalid SACK blocks but simply made reordered SACK blocks that are too old always not counted regardless of their "real" validity which could be determined by using the ack field of the reordered packet (won't be significant IMHO). DSACKs can very well be considered useful even in this situation, so won't do any of this for them. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4c10d9cad20f..4866e75e98e0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1247,8 +1247,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); else NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); - } else + } else { + /* Don't count olds caused by ACK reordering */ + if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && + !after(end_seq, tp->snd_una)) + continue; NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + } continue; } -- cgit v1.2.3 From 912d8f0b1f17b7e851ebbfeb17a16de9f9c7cb88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 25 Sep 2007 22:47:31 -0700 Subject: [TCP] MIB: Count FRTO's successfully detected spurious RTOs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2015148b41a8..9dee70e1cf0d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -245,6 +245,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), + SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4866e75e98e0..4b27739031fb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2935,6 +2935,7 @@ static int tcp_process_frto(struct sock *sk, int flag) } tp->frto_counter = 0; tp->undo_marker = 0; + NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS); } return 0; } -- cgit v1.2.3 From 5967789dbc8aafdba5813fa8e8cfce3c90516f83 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 26 Sep 2007 22:09:25 -0700 Subject: [IPV4]: Remove unnecessary test for the loopback device from inetdev_destroy Currently we never call unregister_netdev for the loopback device so it is impossible for us to reach inetdev_destroy with the loopback device. So the test in inetdev_destroy is unnecessary. Further when testing with my network namespace patches removing unregistering the loopback device and calling inetdev_destroy works fine so there appears to be no reason for avoiding unregistering the loopback device. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index affea9b121fc..e7f2b022316e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -203,8 +203,6 @@ static void inetdev_destroy(struct in_device *in_dev) ASSERT_RTNL(); dev = in_dev->dev; - if (dev == loopback_dev) - return; in_dev->dead = 1; -- cgit v1.2.3 From 0cc217e16cb8ca8ef2544363571fce94259900e0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 26 Sep 2007 22:10:06 -0700 Subject: [IPV4]: When possible test for IFF_LOOPBACK and not dev == loopback_dev Now that multiple loopback devices are becoming possible it makes the code a little cleaner and more maintainable to test if a deivice is th a loopback device by testing dev->flags & IFF_LOOPBACK instead of dev == loopback_dev. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 4 ++-- net/ipv4/ipconfig.c | 10 +++++++--- net/ipv4/ipvs/ip_vs_core.c | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e7f2b022316e..55d199e4ae21 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1059,7 +1059,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, in_dev = inetdev_init(dev); if (!in_dev) return notifier_from_errno(-ENOMEM); - if (dev == loopback_dev) { + if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } @@ -1075,7 +1075,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, case NETDEV_UP: if (dev->mtu < 68) break; - if (dev == loopback_dev) { + if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa; if ((ifa = inet_alloc_ifa()) != NULL) { ifa->ifa_local = diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 2d2e0cda0470..af5d5b39fc13 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -190,11 +190,15 @@ static int __init ic_open_devs(void) rtnl_lock(); /* bring loopback device up first */ - if (dev_change_flags(loopback_dev, loopback_dev->flags | IFF_UP) < 0) - printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev->name); + for_each_netdev(&init_net, dev) { + if (!(dev->flags & IFF_LOOPBACK)) + continue; + if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + } for_each_netdev(&init_net, dev) { - if (dev == loopback_dev) + if (dev->flags & IFF_LOOPBACK) continue; if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : (!(dev->flags & IFF_LOOPBACK) && diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 74503267e5d4..fbca2a2ff29f 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -961,7 +961,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, * ... don't know why 1st test DOES NOT include 2nd (?) */ if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev == loopback_dev || skb->sk)) { + || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", skb->pkt_type, ip_hdr(skb)->protocol, -- cgit v1.2.3 From 2774c7aba6c97a2535be3309a2209770953780b3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 26 Sep 2007 22:10:56 -0700 Subject: [NET]: Make the loopback device per network namespace. This patch makes loopback_dev per network namespace. Adding code to create a different loopback device for each network namespace and adding the code to free a loopback device when a network namespace exits. This patch modifies all users the loopback_dev so they access it as init_net.loopback_dev, keeping all of the code compiling and working. A later pass will be needed to update the users to use something other than the initial network namespace. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/route.c | 18 +++++++++--------- net/ipv4/xfrm4_policy.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ca2878dc6188..2a9b363e820c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1402,8 +1402,8 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; - if (dev != loopback_dev && idev && idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(loopback_dev); + if (dev != init_net.loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); if (loopback_idev) { rt->idev = loopback_idev; in_dev_put(idev); @@ -1555,7 +1555,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = loopback_dev; + rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->fl.oif = 0; @@ -1812,7 +1812,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { int result; result = fib_validate_source(saddr, daddr, tos, - loopback_dev->ifindex, + init_net.loopback_dev->ifindex, dev, &spec_dst, &itag); if (result < 0) goto martian_source; @@ -1879,7 +1879,7 @@ local_input: #endif rth->rt_iif = rth->fl.iif = dev->ifindex; - rth->u.dst.dev = loopback_dev; + rth->u.dst.dev = init_net.loopback_dev; dev_hold(rth->u.dst.dev); rth->idev = in_dev_get(rth->u.dst.dev); rth->rt_gateway = daddr; @@ -2149,7 +2149,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) RT_SCOPE_UNIVERSE), } }, .mark = oldflp->mark, - .iif = loopback_dev->ifindex, + .iif = init_net.loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; unsigned flags = 0; @@ -2243,9 +2243,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); if (dev_out) dev_put(dev_out); - dev_out = loopback_dev; + dev_out = init_net.loopback_dev; dev_hold(dev_out); - fl.oif = loopback_dev->ifindex; + fl.oif = init_net.loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -2290,7 +2290,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) fl.fl4_src = fl.fl4_dst; if (dev_out) dev_put(dev_out); - dev_out = loopback_dev; + dev_out = init_net.loopback_dev; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 29ab3de8c47f..329825ca68fe 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -306,7 +306,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xdst = (struct xfrm_dst *)dst; if (xdst->u.rt.idev->dev == dev) { - struct in_device *loopback_idev = in_dev_get(loopback_dev); + struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev); BUG_ON(!loopback_idev); do { -- cgit v1.2.3 From 0c4e85813d0a94eeb8bf813397a4907bdd7bb610 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 9 Oct 2007 01:36:32 -0700 Subject: [NET]: Wrap netdevice hardware header creation. Add inline for common usage of hardware header creation, and fix bug in IPV6 mcast where the assumption about negative return is an errno. Negative return from hard_header means not enough space was available,(ie -N bytes). Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 +-- net/ipv4/ipconfig.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 3a683006d761..5b24c65b13c6 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -591,8 +591,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, /* * Fill the device header for the ARP frame */ - if (dev->hard_header && - dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index af5d5b39fc13..c5c107a01823 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -757,8 +757,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d /* Chain packet down the line... */ skb->dev = dev; skb->protocol = htons(ETH_P_IP); - if ((dev->hard_header && - dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) || + if (dev_hard_header(skb, dev, ntohs(skb->protocol), + dev->broadcast, dev->dev_addr, skb->len) < 0 || dev_queue_xmit(skb) < 0) printk("E"); } -- cgit v1.2.3 From b95cce3576813ac3f86bafa6b5daaaaf7574b0fe Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 26 Sep 2007 22:13:38 -0700 Subject: [NET]: Wrap hard_header_parse Wrap the hard_header_parse function to simplify next step of header_ops conversion. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 82fda92e6b97..aaa3f5c56761 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -250,10 +250,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) if (entry->info->indev && entry->skb->dev) { pmsg->hw_type = entry->skb->dev->type; - if (entry->skb->dev->hard_header_parse) - pmsg->hw_addrlen = - entry->skb->dev->hard_header_parse(entry->skb, - pmsg->hw_addr); + pmsg->hw_addrlen = dev_parse_header(entry->skb, + pmsg->hw_addr); } if (data_len) -- cgit v1.2.3 From 3b04ddde02cf1b6f14f2697da5c20eca5715017f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 9 Oct 2007 01:40:57 -0700 Subject: [NET]: Move hardware header operations out of netdevice. Since hardware header operations are part of the protocol class not the device instance, make them into a separate object and save memory. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/arp.c | 6 ++++-- net/ipv4/ip_gre.c | 13 +++++++++---- net/ipv4/ip_output.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5b24c65b13c6..d8248198bcd7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -253,7 +253,7 @@ static int arp_constructor(struct neighbour *neigh) neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); - if (dev->hard_header == NULL) { + if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh->ops->queue_xmit; @@ -310,10 +310,12 @@ static int arp_constructor(struct neighbour *neigh) neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } - if (dev->hard_header_cache) + + if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; + if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ffa9f1c9dcbb..f151900efaf9 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -684,7 +684,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error; } - if (dev->hard_header) { + if (dev->header_ops) { gre_hlen = 0; tiph = (struct iphdr*)skb->data; } else { @@ -1063,8 +1063,9 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) */ -static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - void *daddr, void *saddr, unsigned len) +static int ipgre_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + const void *daddr, const void *saddr, unsigned len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); @@ -1091,6 +1092,10 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned sh return -t->hlen; } +static const struct header_ops ipgre_header_ops = { + .create = ipgre_header, +}; + static int ipgre_open(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); @@ -1187,7 +1192,7 @@ static int ipgre_tunnel_init(struct net_device *dev) if (!iph->saddr) return -EINVAL; dev->flags = IFF_BROADCAST; - dev->hard_header = ipgre_header; + dev->header_ops = &ipgre_header_ops; dev->open = ipgre_open; dev->stop = ipgre_close; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 77f67b7cb9bf..699f06781fd8 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -169,7 +169,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); /* Be paranoid, rather than too clever. */ - if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { struct sk_buff *skb2; skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); -- cgit v1.2.3 From df6fb868d6118686805c2fa566e213a8f31c8e4f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 28 Sep 2007 14:37:03 -0700 Subject: [NETFILTER]: nfnetlink: convert to generic netlink attribute functions Get rid of the duplicated rtnetlink macros and use the generic netlink attribute functions. The old duplicated stuff is moved to a new header file that exists just for userspace. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 20 ++++++++--------- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 30 +++++++++++++------------- net/ipv4/netfilter/nf_nat_core.c | 16 +++++++------- 3 files changed, 33 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index f813e02aab30..f8771e058b9e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -363,32 +363,32 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv4_tuple_to_nfattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), + NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.u3.ip); - NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), + NLA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.u3.ip); return 0; -nfattr_failure: +nla_put_failure: return -1; } -static const size_t cta_min_ip[CTA_IP_MAX] = { - [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), - [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +static const size_t cta_min_ip[CTA_IP_MAX+1] = { + [CTA_IP_V4_SRC] = sizeof(u_int32_t), + [CTA_IP_V4_DST] = sizeof(u_int32_t), }; -static int ipv4_nfattr_to_tuple(struct nfattr *tb[], +static int ipv4_nfattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t) { - if (!tb[CTA_IP_V4_SRC-1] || !tb[CTA_IP_V4_DST-1]) + if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) return -EINVAL; - t->src.u3.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); - t->dst.u3.ip = *(__be32 *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]); return 0; } diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 6593fd2c5b10..714332b8869e 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -235,42 +235,42 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, static int icmp_tuple_to_nfattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { - NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), &t->src.u.icmp.id); - NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + NLA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), &t->dst.u.icmp.type); - NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + NLA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), &t->dst.u.icmp.code); return 0; -nfattr_failure: +nla_put_failure: return -1; } -static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t) +static const size_t cta_min_proto[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMP_TYPE] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID] = sizeof(u_int16_t) }; -static int icmp_nfattr_to_tuple(struct nfattr *tb[], +static int icmp_nfattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { - if (!tb[CTA_PROTO_ICMP_TYPE-1] - || !tb[CTA_PROTO_ICMP_CODE-1] - || !tb[CTA_PROTO_ICMP_ID-1]) + if (!tb[CTA_PROTO_ICMP_TYPE] + || !tb[CTA_PROTO_ICMP_CODE] + || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) return -EINVAL; tuple->dst.u.icmp.type = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]); tuple->dst.u.icmp.code = - *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_CODE]); tuple->src.u.icmp.id = - *(__be16 *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + *(__be16 *)nla_data(tb[CTA_PROTO_ICMP_ID]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index deab27facbad..4bdbb128fe50 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -547,38 +547,38 @@ int nf_nat_port_range_to_nfattr(struct sk_buff *skb, const struct nf_nat_range *range) { - NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), + NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), &range->min.tcp.port); - NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), + NLA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(__be16), &range->max.tcp.port); return 0; -nfattr_failure: +nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nf_nat_port_nfattr_to_range); int -nf_nat_port_nfattr_to_range(struct nfattr *tb[], struct nf_nat_range *range) +nf_nat_port_nfattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { int ret = 0; /* we have to return whether we actually parsed something or not */ - if (tb[CTA_PROTONAT_PORT_MIN-1]) { + if (tb[CTA_PROTONAT_PORT_MIN]) { ret = 1; range->min.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); + *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MIN]); } - if (!tb[CTA_PROTONAT_PORT_MAX-1]) { + if (!tb[CTA_PROTONAT_PORT_MAX]) { if (ret) range->max.tcp.port = range->min.tcp.port; } else { ret = 1; range->max.tcp.port = - *(__be16 *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); + *(__be16 *)nla_data(tb[CTA_PROTONAT_PORT_MAX]); } return ret; -- cgit v1.2.3 From fdf708322d4658daa6eb795d1a835b97efdb335e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 28 Sep 2007 14:37:41 -0700 Subject: [NETFILTER]: nfnetlink: rename functions containing 'nfattr' There is no struct nfattr anymore, rename functions to 'nlattr'. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 10 +++++----- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 10 +++++----- net/ipv4/netfilter/nf_nat_core.c | 8 ++++---- net/ipv4/netfilter/nf_nat_proto_gre.c | 4 ++-- net/ipv4/netfilter/nf_nat_proto_icmp.c | 4 ++-- net/ipv4/netfilter/nf_nat_proto_tcp.c | 4 ++-- net/ipv4/netfilter/nf_nat_proto_udp.c | 4 ++-- 7 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index f8771e058b9e..77ca556aad91 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -360,7 +360,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) #include #include -static int ipv4_tuple_to_nfattr(struct sk_buff *skb, +static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { NLA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), @@ -378,13 +378,13 @@ static const size_t cta_min_ip[CTA_IP_MAX+1] = { [CTA_IP_V4_DST] = sizeof(u_int32_t), }; -static int ipv4_nfattr_to_tuple(struct nlattr *tb[], +static int ipv4_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t) { if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + if (nlattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) return -EINVAL; t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]); @@ -411,8 +411,8 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { .print_conntrack = ipv4_print_conntrack, .get_l4proto = ipv4_get_l4proto, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = ipv4_tuple_to_nfattr, - .nfattr_to_tuple = ipv4_nfattr_to_tuple, + .tuple_to_nlattr = ipv4_tuple_to_nlattr, + .nlattr_to_tuple = ipv4_nlattr_to_tuple, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 714332b8869e..ca7252c10758 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -232,7 +232,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, #include #include -static int icmp_tuple_to_nfattr(struct sk_buff *skb, +static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { NLA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), @@ -254,7 +254,7 @@ static const size_t cta_min_proto[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMP_ID] = sizeof(u_int16_t) }; -static int icmp_nfattr_to_tuple(struct nlattr *tb[], +static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { if (!tb[CTA_PROTO_ICMP_TYPE] @@ -262,7 +262,7 @@ static int icmp_nfattr_to_tuple(struct nlattr *tb[], || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; - if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + if (nlattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) return -EINVAL; tuple->dst.u.icmp.type = @@ -327,8 +327,8 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = .destroy = NULL, .me = NULL, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .tuple_to_nfattr = icmp_tuple_to_nfattr, - .nfattr_to_tuple = icmp_nfattr_to_tuple, + .tuple_to_nlattr = icmp_tuple_to_nlattr, + .nlattr_to_tuple = icmp_nlattr_to_tuple, #endif #ifdef CONFIG_SYSCTL .ctl_table_header = &icmp_sysctl_header, diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 4bdbb128fe50..7221aa20e6ff 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -544,7 +544,7 @@ EXPORT_SYMBOL(nf_nat_protocol_unregister); #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) int -nf_nat_port_range_to_nfattr(struct sk_buff *skb, +nf_nat_port_range_to_nlattr(struct sk_buff *skb, const struct nf_nat_range *range) { NLA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(__be16), @@ -557,10 +557,10 @@ nf_nat_port_range_to_nfattr(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nf_nat_port_nfattr_to_range); +EXPORT_SYMBOL_GPL(nf_nat_port_nlattr_to_range); int -nf_nat_port_nfattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) +nf_nat_port_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) { int ret = 0; @@ -583,7 +583,7 @@ nf_nat_port_nfattr_to_range(struct nlattr *tb[], struct nf_nat_range *range) return ret; } -EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nfattr); +EXPORT_SYMBOL_GPL(nf_nat_port_range_to_nlattr); #endif /* Noone using conntrack by the time this called. */ diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 2e40cc83526a..d562290b1820 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -142,8 +142,8 @@ static struct nf_nat_protocol gre __read_mostly = { .in_range = gre_in_range, .unique_tuple = gre_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index f71ef9b5f428..898d73771155 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -79,7 +79,7 @@ struct nf_nat_protocol nf_nat_protocol_icmp = { .in_range = icmp_in_range, .unique_tuple = icmp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index 123c95913f28..5bbbb2acdc70 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -145,7 +145,7 @@ struct nf_nat_protocol nf_nat_protocol_tcp = { .in_range = tcp_in_range, .unique_tuple = tcp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index 1c4c70e25cd4..a0af4fd95584 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -135,7 +135,7 @@ struct nf_nat_protocol nf_nat_protocol_udp = { .in_range = udp_in_range, .unique_tuple = udp_unique_tuple, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) - .range_to_nfattr = nf_nat_port_range_to_nfattr, - .nfattr_to_range = nf_nat_port_nfattr_to_range, + .range_to_nlattr = nf_nat_port_range_to_nlattr, + .nlattr_to_range = nf_nat_port_nlattr_to_range, #endif }; -- cgit v1.2.3 From f73e924cdd166360e8cc9a1b193008fdc9b3e3e2 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 28 Sep 2007 14:39:55 -0700 Subject: [NETFILTER]: ctnetlink: use netlink policy Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 10 ++++------ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 12 +++++------- 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 77ca556aad91..2fcb9249a8da 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -373,9 +373,9 @@ nla_put_failure: return -1; } -static const size_t cta_min_ip[CTA_IP_MAX+1] = { - [CTA_IP_V4_SRC] = sizeof(u_int32_t), - [CTA_IP_V4_DST] = sizeof(u_int32_t), +static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = { + [CTA_IP_V4_SRC] = { .type = NLA_U32 }, + [CTA_IP_V4_DST] = { .type = NLA_U32 }, }; static int ipv4_nlattr_to_tuple(struct nlattr *tb[], @@ -384,9 +384,6 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - if (nlattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) - return -EINVAL; - t->src.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_SRC]); t->dst.u3.ip = *(__be32 *)nla_data(tb[CTA_IP_V4_DST]); @@ -413,6 +410,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = ipv4_tuple_to_nlattr, .nlattr_to_tuple = ipv4_nlattr_to_tuple, + .nla_policy = ipv4_nla_policy, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) .ctl_table_path = nf_net_ipv4_netfilter_sysctl_path, diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index ca7252c10758..11fedc73049c 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -248,10 +248,10 @@ nla_put_failure: return -1; } -static const size_t cta_min_proto[CTA_PROTO_MAX+1] = { - [CTA_PROTO_ICMP_TYPE] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_CODE] = sizeof(u_int8_t), - [CTA_PROTO_ICMP_ID] = sizeof(u_int16_t) +static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { + [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, + [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], @@ -262,9 +262,6 @@ static int icmp_nlattr_to_tuple(struct nlattr *tb[], || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; - if (nlattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) - return -EINVAL; - tuple->dst.u.icmp.type = *(u_int8_t *)nla_data(tb[CTA_PROTO_ICMP_TYPE]); tuple->dst.u.icmp.code = @@ -329,6 +326,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_to_tuple = icmp_nlattr_to_tuple, + .nla_policy = icmp_nla_policy, #endif #ifdef CONFIG_SYSCTL .ctl_table_header = &icmp_sysctl_header, -- cgit v1.2.3 From dc86967b54aaf64fb053cce83c05a4476d48583b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 1 Oct 2007 15:27:19 -0700 Subject: [TCP]: No fackets_out/highest_sack tuning when SACK isn't enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was found due to bug report from Cedric Le Goater though it turned this turned out to be unrelated bug. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 94c80113df7e..6199abeb0f5e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -660,7 +660,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, int decr) { - if (!tp->sacked_out) + if (!tp->sacked_out || tcp_is_reno(tp)) return; if (!before(tp->highest_sack, TCP_SKB_CB(skb)->seq)) @@ -712,7 +712,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; - if (tp->sacked_out && (TCP_SKB_CB(skb)->seq == tp->highest_sack)) + if (tcp_is_sack(tp) && tp->sacked_out && + (TCP_SKB_CB(skb)->seq == tp->highest_sack)) tp->highest_sack = TCP_SKB_CB(buff)->seq; /* PSH and FIN should only be set in the second packet. */ @@ -1718,7 +1719,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - if (WARN_ON(tp->sacked_out && + if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out && (TCP_SKB_CB(next_skb)->seq == tp->highest_sack))) return; -- cgit v1.2.3 From 95eacd27e2a0924f1435654c06712cee6be099ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 1 Oct 2007 15:27:42 -0700 Subject: [TCP]: fix comments that got messed up during code move MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4b27739031fb..904289d2b6bb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1467,8 +1467,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if TCP has never retransmitted anything other than - * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. */ static void tcp_check_reno_reordering(struct sock *sk, const int addend) { @@ -1516,6 +1517,9 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) + */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From 0e835331e3111e5a92eb3a852405ea71ca8fff97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 1 Oct 2007 15:28:17 -0700 Subject: [TCP]: Update comment of SACK block validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just came across what RFC2018 states about generation of valid SACK blocks in case of reneging. Alter comment a bit to point out clearly. IMHO, there isn't any reason to change code because the validation is there for a purpose (counters will inform user about decision TCP made if this case ever surfaces). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 904289d2b6bb..c1339d88bbf3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1027,8 +1027,15 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * SACK block range validation checks that the received SACK block fits to * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included to the range though being valid because - * it means that the receiver is rather inconsistent with itself (reports - * SACK reneging when it should advance SND.UNA). + * it means that the receiver is rather inconsistent with itself reporting + * SACK reneging when it should advance SND.UNA. Such SACK block this is + * perfectly valid, however, in light of RFC2018 which explicitly states + * that "SACK block MUST reflect the newest segment. Even if the newest + * segment is going to be discarded ...", not that it looks very clever + * in case of head skb. Due to potentional receiver driven attacks, we + * choose to avoid immediate execution of a walk in write queue due to + * reneging and defer head skb's loss recovery to standard loss recovery + * procedure that will eventually trigger (nothing forbids us doing this). * * Implements also blockage to start_seq wrap-around. Problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), -- cgit v1.2.3 From 3de96471bd7fb76406e975ef6387abe3a0698149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 1 Oct 2007 15:28:48 -0700 Subject: [TCP]: Wrap-safed reordering detection FRTO check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case somebody has a suggestion about a better place for this check, which must guarantee execution "early enough" (i.e, before the wrap can occur), I'm very open to them. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c1339d88bbf3..7c1a92ffa083 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3027,6 +3027,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + /* Guarantee sacktag reordering detection against wrap-arounds */ + if (before(tp->frto_highmark, tp->snd_una)) + tp->frto_highmark = 0; if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); -- cgit v1.2.3 From 29d0a309d11bac9e57af914d0d6a35cde0080861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sun, 7 Oct 2007 23:36:41 -0700 Subject: [TCP]: Fix two off-by-one errors in fackets_out adjusting logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) Passing wrong skb to tcp_adjust_fackets_out could corrupt fastpath_cnt_hint as tcp_skb_pcount(next_skb) is not included to it if hint points exactly to the next_skb (it's lagging behind, see sacktag). 2) When fastpath_skb_hint is put backwards to avoid dangling skb reference, the skb's pcount must also be removed from count (not included like above). Reported by Cedric Le Goater Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6199abeb0f5e..53296753b0bd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1755,14 +1755,16 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m if (tcp_is_reno(tp) && tp->sacked_out) tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - tcp_adjust_fackets_out(tp, skb, tcp_skb_pcount(next_skb)); + tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb)); tp->packets_out -= tcp_skb_pcount(next_skb); /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); /* manually tune sacktag skb hint */ - if (tp->fastpath_skb_hint == next_skb) + if (tp->fastpath_skb_hint == next_skb) { tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint -= tcp_skb_pcount(skb); + } sk_stream_free_skb(sk, next_skb); } -- cgit v1.2.3 From de83c058af25aa97ed4864abab11e90e8dead6e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Sun, 7 Oct 2007 23:37:55 -0700 Subject: [TCP]: "Annotate" another fackets_out state reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should no longer be necessary because fackets_out is accurate. It indicates bugs elsewhere, thus report it. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7c1a92ffa083..4268cd13ff9a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1160,7 +1160,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int first_sack_index; if (!tp->sacked_out) { - tp->fackets_out = 0; + if (WARN_ON(tp->fackets_out)) + tp->fackets_out = 0; tp->highest_sack = tp->snd_una; } prior_fackets = tp->fackets_out; -- cgit v1.2.3 From cfcabdcc2d5a810208e5bb3974121b7ed60119aa Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 9 Oct 2007 01:59:42 -0700 Subject: [NET]: sparse warning fixes Fix a bunch of sparse warnings. Mostly about 0 used as NULL pointer, and shadowed variable declarations. One notable case was that hash size should have been unsigned. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 5 ++--- net/ipv4/inet_lro.c | 2 +- net/ipv4/ip_sockglue.c | 6 +++--- net/ipv4/proc.c | 2 +- net/ipv4/route.c | 4 ++-- net/ipv4/tcp_input.c | 2 +- 6 files changed, 10 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index ad500a43b359..2b6e59c4c0d0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1695,8 +1695,8 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST - struct in_device *in_dev = pmc->interface; struct ip_sf_list *psf; + in_dev = pmc->interface; #endif /* filter mode change */ @@ -1799,7 +1799,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, { int err; - if (iml->sflist == 0) { + if (iml->sflist == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); @@ -2167,7 +2167,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, return -EFAULT; } for (i=0; itcp_next_seq != ntohl(tcph->seq)) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b2b3053dfef7..f51f20e487c8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -659,7 +659,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } msf = kmalloc(optlen, GFP_KERNEL); - if (msf == 0) { + if (!msf) { err = -ENOBUFS; break; } @@ -816,7 +816,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } gsf = kmalloc(optlen,GFP_KERNEL); - if (gsf == 0) { + if (!gsf) { err = -ENOBUFS; break; } @@ -836,7 +836,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); msf = kmalloc(msize,GFP_KERNEL); - if (msf == 0) { + if (!msf) { err = -ENOBUFS; goto mc_msf_out; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9dee70e1cf0d..e5b05b039101 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -144,7 +144,7 @@ static struct { { "TimestampReps", ICMP_TIMESTAMPREPLY }, { "AddrMasks", ICMP_ADDRESS }, { "AddrMaskReps", ICMP_ADDRESSREPLY }, - { 0, 0 } + { NULL, 0 } }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2a9b363e820c..307e1f1107ca 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -246,7 +246,7 @@ static spinlock_t *rt_hash_locks; static struct rt_hash_bucket *rt_hash_table; static unsigned rt_hash_mask; -static int rt_hash_log; +static unsigned int rt_hash_log; static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); @@ -593,7 +593,7 @@ static void rt_check_expire(struct work_struct *work) i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - if (*rthp == 0) + if (*rthp == NULL) continue; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4268cd13ff9a..e8c39488cf58 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2704,7 +2704,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); if (!tp->packets_out && tcp_is_sack(tp)) { - const struct inet_connection_sock *icsk = inet_csk(sk); + icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, icsk->icsk_ca_state); -- cgit v1.2.3 From 4b7137ff8fb49d7bf22dfa248baa0d02ace2c43d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 8 Oct 2007 17:13:44 -0700 Subject: [IPSEC] esp: Remove keys from esp_data structure The keys are only used during initialisation so we don't need to carry them in esp_data. Since we don't have to allocate them again, there is no need to place a limit on the authentication key length anymore. This patch also kills the unused auth.icv member. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 98767a4f1185..d233e2e62500 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -343,11 +343,6 @@ static int esp_init_state(struct xfrm_state *x) struct crypto_blkcipher *tfm; u32 align; - /* null auth and encryption can have zero length keys */ - if (x->aalg) { - if (x->aalg->alg_key_len > 512) - goto error; - } if (x->ealg == NULL) goto error; @@ -359,15 +354,14 @@ static int esp_init_state(struct xfrm_state *x) struct xfrm_algo_desc *aalg_desc; struct crypto_hash *hash; - esp->auth.key = x->aalg->alg_key; - esp->auth.key_len = (x->aalg->alg_key_len+7)/8; hash = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(hash)) goto error; esp->auth.tfm = hash; - if (crypto_hash_setkey(hash, esp->auth.key, esp->auth.key_len)) + if (crypto_hash_setkey(hash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) goto error; aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); @@ -389,8 +383,7 @@ static int esp_init_state(struct xfrm_state *x) if (!esp->auth.work_icv) goto error; } - esp->conf.key = x->ealg->alg_key; - esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + tfm = crypto_alloc_blkcipher(x->ealg->alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) goto error; @@ -403,7 +396,8 @@ static int esp_init_state(struct xfrm_state *x) goto error; esp->conf.ivinitted = 0; } - if (crypto_blkcipher_setkey(tfm, esp->conf.key, esp->conf.key_len)) + if (crypto_blkcipher_setkey(tfm, x->ealg->alg_key, + (x->ealg->alg_key_len + 7) / 8)) goto error; x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; if (x->props.mode == XFRM_MODE_TUNNEL) -- cgit v1.2.3 From bc31d3b2c7d7f2a03721a05cb3c9a3ce8b1e2e5a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 8 Oct 2007 17:14:34 -0700 Subject: [IPSEC] ah: Remove keys from ah_data structure The keys are only used during initialisation so we don't need to carry them in esp_data. Since we don't have to allocate them again, there is no need to place a limit on the authentication key length anymore. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 39f6211f1496..dc1d8e871b24 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -219,10 +219,6 @@ static int ah_init_state(struct xfrm_state *x) if (!x->aalg) goto error; - /* null auth can use a zero length key */ - if (x->aalg->alg_key_len > 512) - goto error; - if (x->encap) goto error; @@ -230,14 +226,13 @@ static int ah_init_state(struct xfrm_state *x) if (ahp == NULL) return -ENOMEM; - ahp->key = x->aalg->alg_key; - ahp->key_len = (x->aalg->alg_key_len+7)/8; tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) goto error; ahp->tfm = tfm; - if (crypto_hash_setkey(tfm, ahp->key, ahp->key_len)) + if (crypto_hash_setkey(tfm, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) goto error; /* -- cgit v1.2.3 From 406ef77c893ebd882209be4e393d64b01fe72054 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 8 Oct 2007 17:16:30 -0700 Subject: [IPSEC]: Move common output code to xfrm_output Most of the code in xfrm4_output_one and xfrm6_output_one are identical so this patch moves them into a common xfrm_output function which will live in net/xfrm. In fact this would seem to fix a bug as on IPv4 we never reset the network header after a transform which may upset netfilter later on. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/xfrm4_output.c | 40 ++++------------------------------------ 1 file changed, 4 insertions(+), 36 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 44ef208a75cb..04805c7d79c3 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -41,58 +40,27 @@ out: return ret; } -static int xfrm4_output_one(struct sk_buff *skb) +static inline int xfrm4_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; int err; - if (skb->ip_summed == CHECKSUM_PARTIAL) { - err = skb_checksum_help(skb); - if (err) - goto error_nolock; - } - if (x->props.mode == XFRM_MODE_TUNNEL) { err = xfrm4_tunnel_check_size(skb); if (err) goto error_nolock; } - do { - spin_lock_bh(&x->lock); - err = xfrm_state_check(x, skb); - if (err) - goto error; - - err = x->mode->output(x, skb); - if (err) - goto error; - - err = x->type->output(x, skb); - if (err) - goto error; - - x->curlft.bytes += skb->len; - x->curlft.packets++; - - spin_unlock_bh(&x->lock); - - if (!(skb->dst = dst_pop(dst))) { - err = -EHOSTUNREACH; - goto error_nolock; - } - dst = skb->dst; - x = dst->xfrm; - } while (x && (x->props.mode != XFRM_MODE_TUNNEL)); + err = xfrm_output(skb); + if (err) + goto error_nolock; IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; err = 0; out_exit: return err; -error: - spin_unlock_bh(&x->lock); error_nolock: kfree_skb(skb); goto out_exit; -- cgit v1.2.3 From 436a0a402203d5a47d2edf7e4dde6c08a7257983 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 8 Oct 2007 17:25:53 -0700 Subject: [IPSEC]: Move output replay code into xfrm_output The replay counter is one of only two remaining things in the output code that requires a lock on the xfrm state (the other being the crypto). This patch moves it into the generic xfrm_output so we can remove the lock from the transforms themselves. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 4 ++-- net/ipv4/esp4.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index dc1d8e871b24..58af298e1941 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -96,8 +96,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(++x->replay.oseq); - xfrm_aevent_doreplay(x); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq); err = ah_mac_digest(ahp, skb, ah->auth_data); if (err) goto error; @@ -297,6 +296,7 @@ static struct xfrm_type ah_type = .description = "AH4", .owner = THIS_MODULE, .proto = IPPROTO_AH, + .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah_init_state, .destructor = ah_destroy, .input = ah_input, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d233e2e62500..0f62af9a7f15 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -95,8 +95,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->protocol = IPPROTO_ESP; esph->spi = x->id.spi; - esph->seq_no = htonl(++x->replay.oseq); - xfrm_aevent_doreplay(x); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq); if (esp->conf.ivlen) { if (unlikely(!esp->conf.ivinitted)) { @@ -437,6 +436,7 @@ static struct xfrm_type esp_type = .description = "ESP4", .owner = THIS_MODULE, .proto = IPPROTO_ESP, + .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp_init_state, .destructor = esp_destroy, .get_mtu = esp4_get_mtu, -- cgit v1.2.3 From 007f0211a8872f32381f5d44becf8eb2f27f3c30 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Oct 2007 13:25:59 -0700 Subject: [IPSEC]: Store IPv6 nh pointer in mac_header on output Current the x->mode->output functions store the IPv6 nh pointer in the skb network header. This is inconvenient because the network header then has to be fixed up before the packet can leave the IPsec stack. The mac header field is unused on output so we can use that to store this instead. This patch does that and removes the network header fix-up in xfrm_output. It also uses ipv6_hdr where appropriate in the x->type->output functions. There is also a minor clean-up in esp4 to make it use the same code as esp6 to help any subsequent effort to merge the two. Lastly it kills two redundant skb_set_* statements in BEET that were simply copied over from transport mode. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0f62af9a7f15..ffd565350411 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -59,7 +59,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) tail[clen - skb->len - 2] = (clen - skb->len) - 2; pskb_put(skb, trailer, clen - skb->len); - __skb_push(skb, skb->data - skb_network_header(skb)); + __skb_push(skb, -skb_network_offset(skb)); top_iph = ip_hdr(skb); esph = (struct ip_esp_hdr *)(skb_network_header(skb) + top_iph->ihl * 4); -- cgit v1.2.3 From b7c6538cd84f8072fad43bfce530f5bf695edbba Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Oct 2007 13:33:35 -0700 Subject: [IPSEC]: Move state lock into x->type->output This patch releases the lock on the state before calling x->type->output. It also adds the lock to the spots where they're currently needed. Most of those places (all except mip6) are expected to disappear with async crypto. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 7 ++++++- net/ipv4/esp4.c | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 58af298e1941..3513149c3843 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -97,10 +98,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq); + + spin_lock_bh(&x->lock); err = ah_mac_digest(ahp, skb, ah->auth_data); + memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); + spin_unlock_bh(&x->lock); + if (err) goto error; - memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ffd565350411..452910dae89f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->tot_len = htons(skb->len + alen); *(skb_tail_pointer(trailer) - 1) = top_iph->protocol; + spin_lock_bh(&x->lock); + /* this is non-NULL only with UDP Encapsulation */ if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; @@ -111,7 +114,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) if (unlikely(nfrags > ESP_NUM_FAST_SG)) { sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); if (!sg) - goto error; + goto unlock; } skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); err = crypto_blkcipher_encrypt(&desc, sg, sg, clen); @@ -120,7 +123,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } while (0); if (unlikely(err)) - goto error; + goto unlock; if (esp->conf.ivlen) { memcpy(esph->enc_data, esp->conf.ivec, esp->conf.ivlen); @@ -133,6 +136,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(pskb_put(skb, trailer, alen), esp->auth.work_icv, alen); } +unlock: + spin_unlock_bh(&x->lock); + ip_send_check(top_iph); error: -- cgit v1.2.3 From cf7732e4cc14b56d593ff53352673e1fd5e3ba52 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 10 Oct 2007 02:29:29 -0700 Subject: [NET]: Make core networking code use seq_open_private This concerns the ipv4 and ipv6 code mostly, but also the netlink and unix sockets. The netlink code is an example of how to use the __seq_open_private() call - it saves the net namespace on this private. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/arp.c | 20 ++------------------ net/ipv4/fib_hash.c | 20 ++------------------ net/ipv4/fib_trie.c | 42 ++++-------------------------------------- net/ipv4/igmp.c | 38 ++++---------------------------------- net/ipv4/ipmr.c | 43 ++++--------------------------------------- net/ipv4/ipvs/ip_vs_ctl.c | 20 ++------------------ net/ipv4/raw.c | 20 ++------------------ net/ipv4/route.c | 19 ++----------------- 8 files changed, 22 insertions(+), 200 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d8248198bcd7..36d6798947b5 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1378,24 +1378,8 @@ static const struct seq_operations arp_seq_ops = { static int arp_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct neigh_seq_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &arp_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &arp_seq_ops, + sizeof(struct neigh_seq_state)); } static const struct file_operations arp_seq_fops = { diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 9fafbeea8fe6..527a6e0af5b6 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -1039,24 +1039,8 @@ static const struct seq_operations fib_seq_ops = { static int fib_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct fib_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &fib_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &fib_seq_ops, + sizeof(struct fib_iter_state)); } static const struct file_operations fib_seq_fops = { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index be34bd556d58..81a8285d6d6a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2379,25 +2379,8 @@ static const struct seq_operations fib_trie_seq_ops = { static int fib_trie_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &fib_trie_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &fib_trie_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_trie_fops = { @@ -2500,25 +2483,8 @@ static const struct seq_operations fib_route_seq_ops = { static int fib_route_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &fib_route_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; - memset(s, 0, sizeof(*s)); -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &fib_route_seq_ops, + sizeof(struct fib_trie_iter)); } static const struct file_operations fib_route_fops = { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2b6e59c4c0d0..7dbc282d4f9f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2410,23 +2410,8 @@ static const struct seq_operations igmp_mc_seq_ops = { static int igmp_mc_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct igmp_mc_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &igmp_mc_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &igmp_mc_seq_ops, + sizeof(struct igmp_mc_iter_state)); } static const struct file_operations igmp_mc_seq_fops = { @@ -2584,23 +2569,8 @@ static const struct seq_operations igmp_mcf_seq_ops = { static int igmp_mcf_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct igmp_mcf_iter_state *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - rc = seq_open(file, &igmp_mcf_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &igmp_mcf_seq_ops, + sizeof(struct igmp_mcf_iter_state)); } static const struct file_operations igmp_mcf_seq_fops = { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b8b4b497fb57..37bb497d92af 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1714,26 +1714,8 @@ static const struct seq_operations ipmr_vif_seq_ops = { static int ipmr_vif_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &ipmr_vif_seq_ops); - if (rc) - goto out_kfree; - - s->ct = 0; - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; - + return seq_open_private(file, &ipmr_vif_seq_ops, + sizeof(struct ipmr_vif_iter)); } static const struct file_operations ipmr_vif_fops = { @@ -1877,25 +1859,8 @@ static const struct seq_operations ipmr_mfc_seq_ops = { static int ipmr_mfc_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &ipmr_mfc_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; - + return seq_open_private(file, &ipmr_mfc_seq_ops, + sizeof(struct ipmr_mfc_iter)); } static const struct file_operations ipmr_mfc_fops = { diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 61d023d58b5d..7345fc252a23 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1792,24 +1792,8 @@ static const struct seq_operations ip_vs_info_seq_ops = { static int ip_vs_info_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct ip_vs_iter *s = kzalloc(sizeof(*s), GFP_KERNEL); - - if (!s) - goto out; - - rc = seq_open(file, &ip_vs_info_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); } static const struct file_operations ip_vs_info_fops = { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 07070c7067f3..3916faca3afe 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -902,24 +902,8 @@ static const struct seq_operations raw_seq_ops = { static int raw_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct raw_iter_state *s; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - goto out; - rc = seq_open(file, &raw_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &raw_seq_ops, + sizeof(struct raw_iter_state)); } static const struct file_operations raw_seq_fops = { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 307e1f1107ca..21b12de9e653 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -375,23 +375,8 @@ static const struct seq_operations rt_cache_seq_ops = { static int rt_cache_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - struct rt_cache_iter_state *s; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - goto out; - rc = seq_open(file, &rt_cache_seq_ops); - if (rc) - goto out_kfree; - seq = file->private_data; - seq->private = s; -out: - return rc; -out_kfree: - kfree(s); - goto out; + return seq_open_private(file, &rt_cache_seq_ops, + sizeof(struct rt_cache_iter_state)); } static const struct file_operations rt_cache_seq_fops = { -- cgit v1.2.3 From e2da59133880976586b2d9d81d798222ecafa566 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 10 Oct 2007 02:29:58 -0700 Subject: [NETFILTER]: Make netfilter code use the seq_open_private Just switch to the consolidated calls. ipt_recent() has to initialize the private, so use the __seq_open_private() helper. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 15 ++------- .../netfilter/nf_conntrack_l3proto_ipv4_compat.c | 36 +++------------------- 2 files changed, 6 insertions(+), 45 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index db2a79889f9a..11d39fb5f38b 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -381,25 +381,14 @@ static const struct seq_operations recent_seq_ops = { static int recent_seq_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); - struct seq_file *seq; struct recent_iter_state *st; - int ret; - st = kzalloc(sizeof(*st), GFP_KERNEL); + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); if (st == NULL) return -ENOMEM; - ret = seq_open(file, &recent_seq_ops); - if (ret) { - kfree(st); - goto out; - } - st->table = pde->data; - seq = file->private_data; - seq->private = st; -out: - return ret; + return 0; } static ssize_t recent_proc_write(struct file *file, const char __user *input, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index a5ae2eabf0f3..741f3dfaa5a1 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -174,22 +174,8 @@ static const struct seq_operations ct_seq_ops = { static int ct_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct ct_iter_state *st; - int ret; - - st = kzalloc(sizeof(struct ct_iter_state), GFP_KERNEL); - if (st == NULL) - return -ENOMEM; - ret = seq_open(file, &ct_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; - seq->private = st; - return ret; -out_free: - kfree(st); - return ret; + return seq_open_private(file, &ct_seq_ops, + sizeof(struct ct_iter_state)); } static const struct file_operations ct_file_ops = { @@ -291,22 +277,8 @@ static const struct seq_operations exp_seq_ops = { static int exp_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct ct_expect_iter_state *st; - int ret; - - st = kzalloc(sizeof(struct ct_expect_iter_state), GFP_KERNEL); - if (!st) - return -ENOMEM; - ret = seq_open(file, &exp_seq_ops); - if (ret) - goto out_free; - seq = file->private_data; - seq->private = st; - return ret; -out_free: - kfree(st); - return ret; + return seq_open_private(file, &exp_seq_ops, + sizeof(struct ct_expect_iter_state)); } static const struct file_operations ip_exp_file_ops = { -- cgit v1.2.3 From 1c1e87edb9aa38b9e1ae807a6d88fcfd350a55c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 10 Oct 2007 02:45:32 -0700 Subject: [TCP]: Separate lost_retrans loop into own function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follows own function for each task principle, this is really somewhat separate task being done in sacktag. Also reduces indentation. In addition, added ack_seq local var to break some long lines & fixed coding style things. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 80 ++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e8c39488cf58..101054cb13e9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1106,6 +1106,47 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, return !before(start_seq, end_seq - tp->max_window); } +/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". + * Event "C". Later note: FACK people cheated me again 8), we have to account + * for reordering! Ugly, but should help. + */ +static int tcp_mark_lost_retrans(struct sock *sk, u32 lost_retrans) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int flag = 0; + + tcp_for_write_queue(skb, sk) { + u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; + + if (skb == tcp_send_head(sk)) + break; + if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + + if ((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && + after(lost_retrans, ack_seq) && + (tcp_is_fack(tp) || + !before(lost_retrans, + ack_seq + tp->reordering * tp->mss_cache))) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + flag |= FLAG_DATA_SACKED; + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + } + } + } + return flag; +} static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, @@ -1422,43 +1463,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - /* Check for lost retransmit. This superb idea is - * borrowed from "ratehalving". Event "C". - * Later note: FACK people cheated me again 8), - * we have to account for reordering! Ugly, - * but should help. - */ - if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { - struct sk_buff *skb; - - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) - break; - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) - continue; - if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && - after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && - (tcp_is_fack(tp) || - !before(lost_retrans, - TCP_SKB_CB(skb)->ack_seq + tp->reordering * - tp->mss_cache))) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - - /* clear lost hint */ - tp->retransmit_skb_hint = NULL; - - if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { - tp->lost_out += tcp_skb_pcount(skb); - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - flag |= FLAG_DATA_SACKED; - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); - } - } - } - } + if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) + flag |= tcp_mark_lost_retrans(sk, lost_retrans); tcp_verify_left_out(tp); -- cgit v1.2.3 From 8bd170750400bfa5e14c3dd2e2d0f305e1ab0e57 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Oct 2007 15:41:41 -0700 Subject: [IPSEC] esp: Remove NAT-T checksum invalidation for BEET I pointed this out back when this patch was first proposed but it looks like it got lost along the way. The checksum only needs to be ignored for NAT-T in transport mode where we lose the original inner addresses due to NAT. With BEET the inner addresses will be intact so the checksum remains valid. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 452910dae89f..1af332df72d9 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -261,8 +261,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) * as per draft-ietf-ipsec-udp-encaps-06, * section 3.1.2 */ - if (x->props.mode == XFRM_MODE_TRANSPORT || - x->props.mode == XFRM_MODE_BEET) + if (x->props.mode == XFRM_MODE_TRANSPORT) skb->ip_summed = CHECKSUM_UNNECESSARY; } -- cgit v1.2.3 From 7b277b1a5fb147cb828e5d8b9780cee60f31a9bf Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Oct 2007 15:44:06 -0700 Subject: [IPSEC]: Set skb->data to payload in x->mode->output This patch changes the calling convention so that on entry from x->mode->output and before entry into x->type->output skb->data will point to the payload instead of the IP header. This is essentially a redistribution of skb_push/skb_pull calls with the aim of minimising them on the common path of tunnel + ESP. It'll also let us use the same calling convention between IPv4 and IPv6 with the next patch. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 1 + net/ipv4/esp4.c | 6 ++---- net/ipv4/ipcomp.c | 1 + net/ipv4/xfrm4_mode_beet.c | 5 +++-- net/ipv4/xfrm4_mode_transport.c | 4 ++-- net/ipv4/xfrm4_mode_tunnel.c | 3 +-- net/ipv4/xfrm4_tunnel.c | 1 + 7 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 3513149c3843..dbb1f11721e4 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -66,6 +66,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) char buf[60]; } tmp_iph; + skb_push(skb, -skb_network_offset(skb)); top_iph = ip_hdr(skb); iph = &tmp_iph.iph; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1af332df72d9..0f5e8387ccb4 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -28,9 +28,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int alen; int nfrags; - /* Strip IP+ESP header. */ - __skb_pull(skb, skb_transport_offset(skb)); - /* Now skb is pure payload to encrypt */ + /* skb is pure payload to encrypt */ err = -ENOMEM; @@ -60,7 +58,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) tail[clen - skb->len - 2] = (clen - skb->len) - 2; pskb_put(skb, trailer, clen - skb->len); - __skb_push(skb, -skb_network_offset(skb)); + skb_push(skb, -skb_network_offset(skb)); top_iph = ip_hdr(skb); esph = (struct ip_esp_hdr *)(skb_network_header(skb) + top_iph->ihl * 4); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index e787044a8514..1929d451dab5 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -134,6 +134,7 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len = 0; struct iphdr *iph = ip_hdr(skb); + skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); hdr_len = iph->ihl * 4; if ((skb->len - hdr_len) < ipcd->threshold) { diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index a73e710740c2..77888f596737 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -40,10 +40,11 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) if (unlikely(optlen)) hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4); - skb_push(skb, x->props.header_len - IPV4_BEET_PHMAXLEN + hdrlen); - skb_reset_network_header(skb); + skb_set_network_header(skb, IPV4_BEET_PHMAXLEN - x->props.header_len - + hdrlen); top_iph = ip_hdr(skb); skb->transport_header += sizeof(*iph) - hdrlen; + __skb_pull(skb, sizeof(*iph) - hdrlen); memmove(top_iph, iph, sizeof(*iph)); if (unlikely(optlen)) { diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 601047161ea6..10499d2ec65e 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -27,8 +27,8 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) int ihl = iph->ihl * 4; skb->transport_header = skb->network_header + ihl; - skb_push(skb, x->props.header_len); - skb_reset_network_header(skb); + skb_set_network_header(skb, -x->props.header_len); + __skb_pull(skb, ihl); memmove(skb_network_header(skb), iph, ihl); return 0; } diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 9963700e74c1..bac1a91f0cbe 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -49,8 +49,7 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) iph = ip_hdr(skb); skb->transport_header = skb->network_header; - skb_push(skb, x->props.header_len); - skb_reset_network_header(skb); + skb_set_network_header(skb, -x->props.header_len); top_iph = ip_hdr(skb); top_iph->ihl = 5; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 9275c79119b6..be572f918b5e 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -14,6 +14,7 @@ static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); + skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); ip_send_check(iph); -- cgit v1.2.3 From 37fedd3aab6517daec628764c5d66dd8761fbe5f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Oct 2007 15:44:44 -0700 Subject: [IPSEC]: Use IPv6 calling convention as the convention for x->mode->output The IPv6 calling convention for x->mode->output is more general and could help an eventual protocol-generic x->type->output implementation. This patch adopts it for IPv4 as well and modifies the IPv4 type output functions accordingly. It also rewrites the IPv6 mac/transport header calculation to be based off the network header where practical. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 6 +++--- net/ipv4/esp4.c | 11 +++++------ net/ipv4/ipcomp.c | 10 +++++----- net/ipv4/xfrm4_mode_beet.c | 17 +++++++---------- net/ipv4/xfrm4_mode_transport.c | 7 +++---- net/ipv4/xfrm4_mode_tunnel.c | 7 +++---- 6 files changed, 26 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index dbb1f11721e4..e4f7aa39978d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -82,14 +82,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) goto error; } - ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4); - ah->nexthdr = top_iph->protocol; + ah = (struct ip_auth_hdr *)skb_transport_header(skb); + ah->nexthdr = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_AH; top_iph->tos = 0; top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; - top_iph->protocol = IPPROTO_AH; top_iph->check = 0; ahp = x->data; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0f5e8387ccb4..93153d105619 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -60,10 +60,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, -skb_network_offset(skb)); top_iph = ip_hdr(skb); - esph = (struct ip_esp_hdr *)(skb_network_header(skb) + - top_iph->ihl * 4); + esph = (struct ip_esp_hdr *)skb_transport_header(skb); top_iph->tot_len = htons(skb->len + alen); - *(skb_tail_pointer(trailer) - 1) = top_iph->protocol; + *(skb_tail_pointer(trailer) - 1) = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_ESP; spin_lock_bh(&x->lock); @@ -91,9 +91,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) break; } - top_iph->protocol = IPPROTO_UDP; - } else - top_iph->protocol = IPPROTO_ESP; + *skb_mac_header(skb) = IPPROTO_UDP; + } esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 1929d451dab5..bf74f64fe5fb 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -98,10 +98,10 @@ out: static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { struct ipcomp_data *ipcd = x->data; - const int ihlen = ip_hdrlen(skb); + const int ihlen = skb_transport_offset(skb); const int plen = skb->len - ihlen; int dlen = IPCOMP_SCRATCH_SIZE; - u8 *start = skb->data + ihlen; + u8 *start = skb_transport_header(skb); const int cpu = get_cpu(); u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); @@ -154,11 +154,11 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) /* Install ipcomp header, convert into ipcomp datagram. */ iph->tot_len = htons(skb->len); - ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4); - ipch->nexthdr = iph->protocol; + ipch = (struct ip_comp_hdr *)skb_transport_header(skb); + ipch->nexthdr = *skb_mac_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); - iph->protocol = IPPROTO_COMP; + *skb_mac_header(skb) = IPPROTO_COMP; ip_send_check(iph); return 0; diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 77888f596737..7226c6486c01 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -23,17 +23,14 @@ * The following fields in it shall be filled in by x->type->output: * tot_len * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. */ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) { + struct ip_beet_phdr *ph; struct iphdr *iph, *top_iph; int hdrlen, optlen; iph = ip_hdr(skb); - skb->transport_header = skb->network_header; hdrlen = 0; optlen = iph->ihl * 4 - sizeof(*iph); @@ -42,17 +39,17 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) skb_set_network_header(skb, IPV4_BEET_PHMAXLEN - x->props.header_len - hdrlen); - top_iph = ip_hdr(skb); - skb->transport_header += sizeof(*iph) - hdrlen; - __skb_pull(skb, sizeof(*iph) - hdrlen); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*iph); + + ph = (struct ip_beet_phdr *)__skb_pull(skb, sizeof(*iph) - hdrlen); + top_iph = ip_hdr(skb); memmove(top_iph, iph, sizeof(*iph)); if (unlikely(optlen)) { - struct ip_beet_phdr *ph; - BUG_ON(optlen < 0); - ph = (struct ip_beet_phdr *)skb_transport_header(skb); ph->padlen = 4 - (optlen & 4); ph->hdrlen = optlen / 8; ph->nexthdr = top_iph->protocol; diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 10499d2ec65e..fd840c7d75ea 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -17,17 +17,16 @@ * * The IP header will be moved forward to make space for the encapsulation * header. - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. */ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; - skb->transport_header = skb->network_header + ihl; skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + ihl; __skb_pull(skb, ihl); memmove(skb_network_header(skb), iph, ihl); return 0; diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index bac1a91f0cbe..f1d41ea34785 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -35,9 +35,6 @@ static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) * in it shall be filled in by x->type->output: * tot_len * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. */ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { @@ -47,9 +44,11 @@ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) int flags; iph = ip_hdr(skb); - skb->transport_header = skb->network_header; skb_set_network_header(skb, -x->props.header_len); + skb->mac_header = skb->network_header + + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*iph); top_iph = ip_hdr(skb); top_iph->ihl = 5; -- cgit v1.2.3 From 87bdc48d304191313203df9b98d783e1ab5a55ab Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Oct 2007 15:45:25 -0700 Subject: [IPSEC]: Get rid of ipv6_{auth,esp,comp}_hdr This patch removes the duplicate ipv6_{auth,esp,comp}_hdr structures since they're identical to the IPv4 versions. Duplicating them would only create problems for ourselves later when we need to add things like extended sequence numbers. I've also added transport header type conversion headers for these types which are now used by the transforms. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 18 +++++++++--------- net/ipv4/esp4.c | 10 +++++----- net/ipv4/ipcomp.c | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e4f7aa39978d..d69706405d58 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -82,7 +82,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) goto error; } - ah = (struct ip_auth_hdr *)skb_transport_header(skb); + ah = ip_auth_hdr(skb); ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; @@ -93,8 +93,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->check = 0; ahp = x->data; - ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + - ahp->icv_trunc_len) >> 2) - 2; + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; @@ -134,15 +133,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) struct ah_data *ahp; char work_buf[60]; - if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + if (!pskb_may_pull(skb, sizeof(*ah))) goto out; - ah = (struct ip_auth_hdr*)skb->data; + ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; ah_hlen = (ah->hdrlen + 2) << 2; - if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) && - ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len)) + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; if (!pskb_may_pull(skb, ah_hlen)) @@ -156,7 +155,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; - ah = (struct ip_auth_hdr*)skb->data; + ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); ihl = skb->data - skb_network_header(skb); @@ -266,7 +265,8 @@ static int ah_init_state(struct xfrm_state *x) if (!ahp->work_icv) goto error; - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 93153d105619..66eb4968b910 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -60,7 +60,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, -skb_network_offset(skb)); top_iph = ip_hdr(skb); - esph = (struct ip_esp_hdr *)skb_transport_header(skb); + esph = ip_esp_hdr(skb); top_iph->tot_len = htons(skb->len + alen); *(skb_tail_pointer(trailer) - 1) = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_ESP; @@ -157,7 +157,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct sk_buff *trailer; int blksize = ALIGN(crypto_blkcipher_blocksize(tfm), 4); int alen = esp->auth.icv_trunc_len; - int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; + int elen = skb->len - sizeof(*esph) - esp->conf.ivlen - alen; int nfrags; int ihl; u8 nexthdr[2]; @@ -165,7 +165,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) int padlen; int err; - if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) + if (!pskb_may_pull(skb, sizeof(*esph))) goto out; if (elen <= 0 || (elen & (blksize-1))) @@ -193,7 +193,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; - esph = (struct ip_esp_hdr*)skb->data; + esph = (struct ip_esp_hdr *)skb->data; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -206,7 +206,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) if (!sg) goto out; } - skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen); + skb_to_sgvec(skb, sg, sizeof(*esph) + esp->conf.ivlen, elen); err = crypto_blkcipher_decrypt(&desc, sg, sg, elen); if (unlikely(sg != &esp->sgbuf[0])) kfree(sg); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index bf74f64fe5fb..78d6ddb02d1d 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -154,7 +154,7 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) /* Install ipcomp header, convert into ipcomp datagram. */ iph->tot_len = htons(skb->len); - ipch = (struct ip_comp_hdr *)skb_transport_header(skb); + ipch = ip_comp_hdr(skb); ipch->nexthdr = *skb_mac_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); -- cgit v1.2.3 From ceb1eec8291175686d0208e66595ff83bc0624e2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Oct 2007 15:45:52 -0700 Subject: [IPSEC]: Move IP length/checksum setting out of transforms This patch moves the setting of the IP length and checksum fields out of the transforms and into the xfrmX_output functions. This would help future efforts in merging the transforms themselves. It also adds an optimisation to ipcomp due to the fact that the transport offset is guaranteed to be zero. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 2 -- net/ipv4/esp4.c | 7 +------ net/ipv4/ipcomp.c | 22 +++++----------------- net/ipv4/xfrm4_mode_beet.c | 3 --- net/ipv4/xfrm4_mode_tunnel.c | 5 +---- net/ipv4/xfrm4_output.c | 5 +++++ net/ipv4/xfrm4_tunnel.c | 5 ----- 7 files changed, 12 insertions(+), 37 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index d69706405d58..60925fedbf16 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -115,8 +115,6 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - ip_send_check(top_iph); - err = 0; error: diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 66eb4968b910..8377bedf3f66 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -16,7 +16,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) { int err; - struct iphdr *top_iph; struct ip_esp_hdr *esph; struct crypto_blkcipher *tfm; struct blkcipher_desc desc; @@ -59,9 +58,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) pskb_put(skb, trailer, clen - skb->len); skb_push(skb, -skb_network_offset(skb)); - top_iph = ip_hdr(skb); esph = ip_esp_hdr(skb); - top_iph->tot_len = htons(skb->len + alen); *(skb_tail_pointer(trailer) - 1) = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_ESP; @@ -76,7 +73,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) uh = (struct udphdr *)esph; uh->source = encap->encap_sport; uh->dest = encap->encap_dport; - uh->len = htons(skb->len + alen - top_iph->ihl*4); + uh->len = htons(skb->len + alen - skb_transport_offset(skb)); uh->check = 0; switch (encap->encap_type) { @@ -136,8 +133,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) unlock: spin_unlock_bh(&x->lock); - ip_send_check(top_iph); - error: return err; } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 78d6ddb02d1d..32b02deca2ec 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -98,10 +98,9 @@ out: static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) { struct ipcomp_data *ipcd = x->data; - const int ihlen = skb_transport_offset(skb); - const int plen = skb->len - ihlen; + const int plen = skb->len; int dlen = IPCOMP_SCRATCH_SIZE; - u8 *start = skb_transport_header(skb); + u8 *start = skb->data; const int cpu = get_cpu(); u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); @@ -118,7 +117,7 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); put_cpu(); - pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr)); + pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); return 0; out: @@ -131,13 +130,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) int err; struct ip_comp_hdr *ipch; struct ipcomp_data *ipcd = x->data; - int hdr_len = 0; - struct iphdr *iph = ip_hdr(skb); - skb_push(skb, -skb_network_offset(skb)); - iph->tot_len = htons(skb->len); - hdr_len = iph->ihl * 4; - if ((skb->len - hdr_len) < ipcd->threshold) { + if (skb->len < ipcd->threshold) { /* Don't bother compressing */ goto out_ok; } @@ -146,25 +140,19 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; err = ipcomp_compress(x, skb); - iph = ip_hdr(skb); if (err) { goto out_ok; } /* Install ipcomp header, convert into ipcomp datagram. */ - iph->tot_len = htons(skb->len); ipch = ip_comp_hdr(skb); ipch->nexthdr = *skb_mac_header(skb); ipch->flags = 0; ipch->cpi = htons((u16 )ntohl(x->id.spi)); *skb_mac_header(skb) = IPPROTO_COMP; - ip_send_check(iph); - return 0; - out_ok: - if (x->props.mode == XFRM_MODE_TUNNEL) - ip_send_check(iph); + skb_push(skb, -skb_network_offset(skb)); return 0; } diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 7226c6486c01..73d2338bec55 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -20,9 +20,6 @@ /* Add encapsulation header. * * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check */ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) { diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index f1d41ea34785..1ae9d32276f0 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -31,10 +31,7 @@ static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) /* Add encapsulation header. * - * The top IP header will be constructed per RFC 2401. The following fields - * in it shall be filled in by x->type->output: - * tot_len - * check + * The top IP header will be constructed per RFC 2401. */ static int xfrm4_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 04805c7d79c3..434ef302ba83 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -44,6 +44,7 @@ static inline int xfrm4_output_one(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; struct xfrm_state *x = dst->xfrm; + struct iphdr *iph; int err; if (x->props.mode == XFRM_MODE_TUNNEL) { @@ -56,6 +57,10 @@ static inline int xfrm4_output_one(struct sk_buff *skb) if (err) goto error_nolock; + iph = ip_hdr(skb); + iph->tot_len = htons(skb->len); + ip_send_check(iph); + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; err = 0; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index be572f918b5e..e1fafc1562d8 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -12,12 +12,7 @@ static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) { - struct iphdr *iph = ip_hdr(skb); - skb_push(skb, -skb_network_offset(skb)); - iph->tot_len = htons(skb->len); - ip_send_check(iph); - return 0; } -- cgit v1.2.3 From 631a6698d09e57cadc069914d613899609a0ae83 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 10 Oct 2007 15:46:21 -0700 Subject: [IPSEC]: Move IP protocol setting from transforms into xfrm4_input.c This patch makes the IPv4 x->type->input functions return the next protocol instead of setting it directly. This is identical to how we do things in IPv6 and will help us merge common code on the input path. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 5 +++-- net/ipv4/esp4.c | 3 +-- net/ipv4/ipcomp.c | 7 ++++--- net/ipv4/xfrm4_input.c | 7 ++++++- net/ipv4/xfrm4_tunnel.c | 2 +- 5 files changed, 15 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 60925fedbf16..4e8e3b079f5b 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -125,6 +125,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; + int nexthdr; int err = -EINVAL; struct iphdr *iph; struct ip_auth_hdr *ah; @@ -136,6 +137,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; + nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && @@ -182,13 +184,12 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) goto out; } } - ((struct iphdr*)work_buf)->protocol = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_buf, ihl); skb->transport_header = skb->network_header; __skb_pull(skb, ah_hlen + ihl); - return 0; + return nexthdr; out: return err; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8377bedf3f66..6b1a31a74cf2 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -257,12 +257,11 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_UNNECESSARY; } - iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen); skb_set_transport_header(skb, -ihl); - return 0; + return nexthdr[1]; out: return -EINVAL; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 32b02deca2ec..0bfeb02a5f87 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -75,7 +75,6 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -ENOMEM; - struct iphdr *iph; struct ip_comp_hdr *ipch; if (skb_linearize_cow(skb)) @@ -84,12 +83,14 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ - iph = ip_hdr(skb); ipch = (void *)skb->data; - iph->protocol = ipch->nexthdr; skb->transport_header = skb->network_header + sizeof(*ipch); __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); + if (err) + goto out; + + err = ipch->nexthdr; out: return err; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 2fa108245413..e9bbfde19ac3 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -54,12 +54,14 @@ static int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) int xfrm_nr = 0; int decaps = 0; int err = xfrm4_parse_spi(skb, ip_hdr(skb)->protocol, &spi, &seq); + unsigned int nhoff = offsetof(struct iphdr, protocol); if (err != 0) goto drop; do { const struct iphdr *iph = ip_hdr(skb); + int nexthdr; if (xfrm_nr == XFRM_MAX_DEPTH) goto drop; @@ -82,9 +84,12 @@ static int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) if (xfrm_state_check_expire(x)) goto drop_unlock; - if (x->type->input(x, skb)) + nexthdr = x->type->input(x, skb); + if (nexthdr <= 0) goto drop_unlock; + skb_network_header(skb)[nhoff] = nexthdr; + /* only the first xfrm gets the encap type */ encap_type = 0; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index e1fafc1562d8..1312417608e2 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -18,7 +18,7 @@ static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb) { - return 0; + return IPPROTO_IP; } static int ipip_init_state(struct xfrm_state *x) -- cgit v1.2.3 From 227b60f5102cda4e4ab792b526a59c8cb20cd9f8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 10 Oct 2007 17:30:46 -0700 Subject: [INET]: local port range robustness Expansion of original idea from Denis V. Lunev Add robustness and locking to the local_port_range sysctl. 1. Enforce that low < high when setting. 2. Use seqlock to ensure atomic update. The locking might seem like overkill, but there are cases where sysadmin might want to change value in the middle of a DoS attack. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 22 +++++++++--- net/ipv4/inet_hashtables.c | 13 ++++--- net/ipv4/sysctl_net_ipv4.c | 75 ++++++++++++++++++++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/udp.c | 6 ++-- 5 files changed, 98 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fbe7714f21d0..3cef12835c4b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -33,6 +33,19 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); * This array holds the first and last local port number. */ int sysctl_local_port_range[2] = { 32768, 61000 }; +DEFINE_SEQLOCK(sysctl_port_range_lock); + +void inet_get_local_port_range(int *low, int *high) +{ + unsigned seq; + do { + seq = read_seqbegin(&sysctl_port_range_lock); + + *low = sysctl_local_port_range[0]; + *high = sysctl_local_port_range[1]; + } while (read_seqretry(&sysctl_port_range_lock, seq)); +} +EXPORT_SYMBOL(inet_get_local_port_range); int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) @@ -77,10 +90,11 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, local_bh_disable(); if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; + int remaining, rover, low, high; + + inet_get_local_port_range(&low, &high); + remaining = high - low; + rover = net_random() % remaining + low; do { head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fb662621c54e..fac6398e4367 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -279,19 +279,18 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, int ret; if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; + int i, remaining, low, high, port; static u32 hint; u32 offset = hint + inet_sk_port_offset(sk); struct hlist_node *node; struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(&low, &high); + remaining = high - low; + local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; + for (i = 1; i <= remaining; i++) { + port = low + (i + offset) % remaining; head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; spin_lock(&head->lock); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 53ef0f4bbdaa..eb286abcf5dc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,74 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +extern seqlock_t sysctl_port_range_lock; +extern int sysctl_local_port_range[2]; + +/* Update system visible IP port range */ +static void set_local_port_range(int range[2]) +{ + write_seqlock(&sysctl_port_range_lock); + sysctl_local_port_range[0] = range[0]; + sysctl_local_port_range[1] = range[1]; + write_sequnlock(&sysctl_port_range_lock); +} + +/* Validate changes from /proc interface. */ +static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + int range[2] = { sysctl_local_port_range[0], + sysctl_local_port_range[1] }; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_local_port_range_min, + .extra2 = &ip_local_port_range_max, + }; + + ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); + + if (write && ret == 0) { + if (range[1] <= range[0]) + ret = -EINVAL; + else + set_local_port_range(range); + } + + return ret; +} + +/* Validate changes from sysctl interface. */ +static int ipv4_sysctl_local_port_range(ctl_table *table, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + int ret; + int range[2] = { sysctl_local_port_range[0], + sysctl_local_port_range[1] }; + ctl_table tmp = { + .data = &range, + .maxlen = sizeof(range), + .mode = table->mode, + .extra1 = &ip_local_port_range_min, + .extra2 = &ip_local_port_range_max, + }; + + ret = sysctl_intvec(&tmp, name, nlen, oldval, oldlenp, newval, newlen); + if (ret == 0 && newval && newlen) { + if (range[1] <= range[0]) + ret = -EINVAL; + else + set_local_port_range(range); + } + return ret; +} + + static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -427,10 +496,8 @@ ctl_table ipv4_table[] = { .data = &sysctl_local_port_range, .maxlen = sizeof(sysctl_local_port_range), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = ip_local_port_range_min, - .extra2 = ip_local_port_range_max + .proc_handler = &ipv4_local_port_range, + .strategy = &ipv4_sysctl_local_port_range, }, { .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8855e640e958..38cf73a56731 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2470,6 +2470,5 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_proc_register); EXPORT_SYMBOL(tcp_proc_unregister); #endif -EXPORT_SYMBOL(sysctl_local_port_range); EXPORT_SYMBOL(sysctl_tcp_low_latency); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ef4d901ee9ad..cb9fc58efb2f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -147,11 +147,11 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, write_lock_bh(&udp_hash_lock); if (!snum) { - int i; - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; + int i, low, high; unsigned rover, best, best_size_so_far; + inet_get_local_port_range(&low, &high); + best_size_so_far = UINT_MAX; best = rover = net_random() % (high - low) + low; -- cgit v1.2.3 From cd40b7d3983c708aabe3d3008ec64ffce56d33b0 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 10 Oct 2007 21:15:29 -0700 Subject: [NET]: make netlink user -> kernel interface synchronious This patch make processing netlink user -> kernel messages synchronious. This change was inspired by the talk with Alexey Kuznetsov about current netlink messages processing. He says that he was badly wrong when introduced asynchronious user -> kernel communication. The call netlink_unicast is the only path to send message to the kernel netlink socket. But, unfortunately, it is also used to send data to the user. Before this change the user message has been attached to the socket queue and sk->sk_data_ready was called. The process has been blocked until all pending messages were processed. The bad thing is that this processing may occur in the arbitrary process context. This patch changes nlk->data_ready callback to get 1 skb and force packet processing right in the netlink_unicast. Kernel -> user path in netlink_unicast remains untouched. EINTR processing for in netlink_run_queue was changed. It forces rtnl_lock drop, but the process remains in the cycle until the message will be fully processed. So, there is no need to use this kludges now. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 9 ++++++--- net/ipv4/inet_diag.c | 12 ++++-------- net/ipv4/netfilter/ip_queue.c | 17 ++++------------- 3 files changed, 14 insertions(+), 24 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f823ca34cb12..a5cba2349605 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -62,6 +62,9 @@ static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; #define FIB_TABLE_HASHSZ 256 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; +static struct sock *fibnl = NULL; + + struct fib_table *fib_new_table(u32 id) { struct fib_table *tb; @@ -811,13 +814,13 @@ static void nl_fib_input(struct sock *sk, int len) pid = NETLINK_CB(skb).pid; /* pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ - netlink_unicast(sk, skb, pid, MSG_DONTWAIT); + netlink_unicast(fibnl, skb, pid, MSG_DONTWAIT); } static void nl_fib_lookup_init(void) { - netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, nl_fib_input, - NULL, THIS_MODULE); + fibnl = netlink_kernel_create(&init_net, NETLINK_FIB_LOOKUP, 0, + nl_fib_input, NULL, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b04a6ee5a9a1..7eb83ebed2ec 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -839,15 +839,11 @@ static int inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) static DEFINE_MUTEX(inet_diag_mutex); -static void inet_diag_rcv(struct sock *sk, int len) +static void inet_diag_rcv(struct sk_buff *skb) { - unsigned int qlen = 0; - - do { - mutex_lock(&inet_diag_mutex); - qlen = netlink_run_queue(sk, qlen, &inet_diag_rcv_msg); - mutex_unlock(&inet_diag_mutex); - } while (qlen); + mutex_lock(&inet_diag_mutex); + netlink_rcv_skb(skb, &inet_diag_rcv_msg); + mutex_unlock(&inet_diag_mutex); } static DEFINE_SPINLOCK(inet_diag_register_lock); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index aaa3f5c56761..23cbfc7c80fd 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -475,7 +475,7 @@ ipq_dev_drop(int ifindex) #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) static inline void -ipq_rcv_skb(struct sk_buff *skb) +__ipq_rcv_skb(struct sk_buff *skb) { int status, type, pid, flags, nlmsglen, skblen; struct nlmsghdr *nlh; @@ -533,19 +533,10 @@ ipq_rcv_skb(struct sk_buff *skb) } static void -ipq_rcv_sk(struct sock *sk, int len) +ipq_rcv_skb(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&ipqnl_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - ipq_rcv_skb(skb); - kfree_skb(skb); - } - + __ipq_rcv_skb(skb); mutex_unlock(&ipqnl_mutex); } @@ -670,7 +661,7 @@ static int __init ip_queue_init(void) netlink_register_notifier(&ipq_nl_notifier); ipqnl = netlink_kernel_create(&init_net, NETLINK_FIREWALL, 0, - ipq_rcv_sk, NULL, THIS_MODULE); + ipq_rcv_skb, NULL, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; -- cgit v1.2.3 From 28f7b0360f46eeb9eeee63d03bb5484918a54837 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 Oct 2007 21:32:39 -0700 Subject: [NETLINK]: fib_frontend build fixes 1) fibnl needs to be declared outside of config ifdefs, and also should not be explicitly initialized to NULL 2) nl_fib_input() args are wrong for netlink_kernel_create() input method Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a5cba2349605..78b514ba1414 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -49,6 +49,8 @@ #define FFprint(a...) printk(KERN_DEBUG a) +static struct sock *fibnl; + #ifndef CONFIG_IP_MULTIPLE_TABLES struct fib_table *ip_fib_local_table; @@ -62,9 +64,6 @@ static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; #define FIB_TABLE_HASHSZ 256 static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ]; -static struct sock *fibnl = NULL; - - struct fib_table *fib_new_table(u32 id) { struct fib_table *tb; @@ -787,17 +786,12 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) } } -static void nl_fib_input(struct sock *sk, int len) +static void nl_fib_input(struct sk_buff *skb) { - struct sk_buff *skb = NULL; - struct nlmsghdr *nlh = NULL; struct fib_result_nl *frn; - u32 pid; + struct nlmsghdr *nlh; struct fib_table *tb; - - skb = skb_dequeue(&sk->sk_receive_queue); - if (skb == NULL) - return; + u32 pid; nlh = nlmsg_hdr(skb); if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || -- cgit v1.2.3 From 16e906812f885cf16d95577dba260db6375ba571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:32:31 -0700 Subject: [TCP]: Add bytes_acked (ABC) clearing to FRTO too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I was reading tcp_enter_loss while looking for Cedric's bug and noticed bytes_acked adjustment is missing from FRTO side. Since bytes_acked will only be used in tcp_cong_avoid, I think it's safe to assume RTO would be spurious. During FRTO cwnd will be not controlled by tcp_cong_avoid and if FRTO calls for conventional recovery, cwnd is adjusted and the result of wrong assumption is cleared from bytes_acked. If RTO was in fact spurious, we did normal ABC already and can continue without any additional adjustments. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 101054cb13e9..e40857e073b3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1687,6 +1687,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; tp->frto_counter = 0; + tp->bytes_acked = 0; tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); @@ -2824,6 +2825,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } -- cgit v1.2.3 From 3eec0047d9bdd68fddef6539e77fee99ba2531f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:33:11 -0700 Subject: [TCP]: Fix mark_head_lost to ignore R-bit when trying to mark L MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This condition (plain R) can arise at least in recovery that is triggered after tcp_undo_loss. There isn't any reason why they should not be marked as lost, not marking makes in_flight estimator to return too large values. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e40857e073b3..c827285b488e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1987,7 +1987,7 @@ static void tcp_mark_head_lost(struct sock *sk, cnt += tcp_skb_pcount(skb); if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); tcp_verify_retransmit_hint(tp, skb); -- cgit v1.2.3 From f6fb128d272eb7b0f1a8be78153a724545e28be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:33:55 -0700 Subject: [TCP]: Kill almost unused variable pcount from sacktag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's on the way for future cutting of that function. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c827285b488e..50047045df62 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1314,7 +1314,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ flag |= FLAG_DATA_LOST; tcp_for_write_queue_from(skb, sk) { - int in_sack, pcount; + int in_sack; u8 sacked; if (skb == tcp_send_head(sk)) @@ -1336,9 +1336,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); - pcount = tcp_skb_pcount(skb); - - if (pcount > 1 && !in_sack && + if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { unsigned int pkt_len; @@ -1353,10 +1351,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->seq); if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) break; - pcount = tcp_skb_pcount(skb); } - fack_count += pcount; + fack_count += tcp_skb_pcount(skb); sacked = TCP_SKB_CB(skb)->sacked; -- cgit v1.2.3 From d193594299064d386a2705928cd61ab2ca3d7cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:34:25 -0700 Subject: [TCP]: Extract tcp_match_queue_to_sack from sacktag code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is necessary for upcoming DSACK bugfix. Reduces sacktag length which is not very sad thing at all... :-) Notice that there's a need to handle out-of-mem at caller's place. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 54 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 50047045df62..bd18c252dee4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1181,6 +1181,38 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, return dup_sack; } +/* Check if skb is fully within the SACK block. In presence of GSO skbs, + * the incoming SACK may not exactly match but we can find smaller MSS + * aligned portion of it that matches. Therefore we might need to fragment + * which may fail and creates some hassle (caller must handle error case + * returns). + */ +int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, + u32 start_seq, u32 end_seq) +{ + int in_sack, err; + unsigned int pkt_len; + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + if (tcp_skb_pcount(skb) > 1 && !in_sack && + after(TCP_SKB_CB(skb)->end_seq, start_seq)) { + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); + + if (!in_sack) + pkt_len = start_seq - TCP_SKB_CB(skb)->seq; + else + pkt_len = end_seq - TCP_SKB_CB(skb)->seq; + err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); + if (err < 0) + return err; + } + + return in_sack; +} + static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { @@ -1333,25 +1365,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; - in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && - !before(end_seq, TCP_SKB_CB(skb)->end_seq); - - if (tcp_skb_pcount(skb) > 1 && !in_sack && - after(TCP_SKB_CB(skb)->end_seq, start_seq)) { - unsigned int pkt_len; - - in_sack = !after(start_seq, - TCP_SKB_CB(skb)->seq); - - if (!in_sack) - pkt_len = (start_seq - - TCP_SKB_CB(skb)->seq); - else - pkt_len = (end_seq - - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) - break; - } + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); + if (in_sack < 0) + break; fack_count += tcp_skb_pcount(skb); -- cgit v1.2.3 From 4cd829995b86e0359796780d43d2f210cb5cf021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:34:57 -0700 Subject: [TCP]: No need to re-count fackets_out/sacked_out at RTO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both sacked_out and fackets_out are directly known from how parameter. Since fackets_out is accurate, there's no need for recounting (sacked_out was previously unnecessarily counted in the loop anyway). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bd18c252dee4..a06603912137 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1711,18 +1711,23 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tcp_clear_retrans_hints_partial(tp); } -void tcp_clear_retrans(struct tcp_sock *tp) +static void tcp_clear_retrans_partial(struct tcp_sock *tp) { tp->retrans_out = 0; - - tp->fackets_out = 0; - tp->sacked_out = 0; tp->lost_out = 0; tp->undo_marker = 0; tp->undo_retrans = 0; } +void tcp_clear_retrans(struct tcp_sock *tp) +{ + tcp_clear_retrans_partial(tp); + + tp->fackets_out = 0; + tp->sacked_out = 0; +} + /* Enter Loss state. If "how" is not zero, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. @@ -1732,7 +1737,6 @@ void tcp_enter_loss(struct sock *sk, int how) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || @@ -1746,7 +1750,10 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_stamp = tcp_time_stamp; tp->bytes_acked = 0; - tcp_clear_retrans(tp); + tcp_clear_retrans_partial(tp); + + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); if (!how) { /* Push undo marker, if it was plain RTO and nothing @@ -1754,13 +1761,15 @@ void tcp_enter_loss(struct sock *sk, int how) tp->undo_marker = tp->snd_una; tcp_clear_retrans_hints_partial(tp); } else { + tp->sacked_out = 0; + tp->fackets_out = 0; tcp_clear_all_retrans_hints(tp); } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - cnt += tcp_skb_pcount(skb); + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; @@ -1768,9 +1777,6 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - } else { - tp->sacked_out += tcp_skb_pcount(skb); - tp->fackets_out = cnt; } } tcp_verify_left_out(tp); -- cgit v1.2.3 From f785a8e28b9d103c7473655743b6ac1bc3cd3a58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:35:41 -0700 Subject: [TCP]: Fix lost_retrans loop vs fastpath problems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detection implemented with lost_retrans must work also when fastpath is taken, yet most of the queue is skipped including (very likely) those retransmitted skb's we're interested in. This problem appeared when the hints got added, which removed a need to always walk over the whole write queue head. Therefore decicion for the lost_retrans worker loop entry must be separated from the sacktag processing more than it was necessary before. It turns out to be problematic to optimize the worker loop very heavily because ack_seqs of skb may have a number of discontinuity points. Maybe similar approach as currently is implemented could be attempted but that's becoming more and more complex because the trend is towards less skb walking in sacktag marker. Trying a simple work until all rexmitted skbs heve been processed approach. Maybe after(highest_sack_end_seq, tp->high_seq) checking is not sufficiently accurate and causes entry too often in no-work-to-do cases. Since that's not known, I've separated solution to that from this patch. Noticed because of report against a related problem from TAKANO Ryousei . He also provided a patch to that part of the problem. This patch includes solution to it (though this patch has to use somewhat different placement). TAKANO's description and patch is available here: http://marc.info/?l=linux-netdev&m=119149311913288&w=2 ...In short, TAKANO's problem is that end_seq the loop is using not necessarily the largest SACK block's end_seq because the current ACK may still have higher SACK blocks which are later by the loop. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a06603912137..d5e0fcc22a3b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1109,27 +1109,34 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". * Event "C". Later note: FACK people cheated me again 8), we have to account * for reordering! Ugly, but should help. + * + * Search retransmitted skbs from write_queue that were sent when snd_nxt was + * less than what is now known to be received by the other end (derived from + * SACK blocks by the caller). */ -static int tcp_mark_lost_retrans(struct sock *sk, u32 lost_retrans) +static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int flag = 0; + int cnt = 0; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; if (skb == tcp_send_head(sk)) break; - if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) + if (cnt == tp->retrans_out) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) continue; - if ((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) && - after(lost_retrans, ack_seq) && + if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) + continue; + + if (after(received_upto, ack_seq) && (tcp_is_fack(tp) || - !before(lost_retrans, + !before(received_upto, ack_seq + tp->reordering * tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); @@ -1143,6 +1150,8 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 lost_retrans) flag |= FLAG_DATA_SACKED; NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } + } else { + cnt += tcp_skb_pcount(skb); } } return flag; @@ -1225,7 +1234,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; int reord = tp->packets_out; int prior_fackets; - u32 lost_retrans = 0; + u32 highest_sack_end_seq = 0; int flag = 0; int found_dup_sack = 0; int cached_fack_count; @@ -1396,11 +1405,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ continue; } - if ((sacked&TCPCB_SACKED_RETRANS) && - after(end_seq, TCP_SKB_CB(skb)->ack_seq) && - (!lost_retrans || after(end_seq, lost_retrans))) - lost_retrans = end_seq; - if (!in_sack) continue; @@ -1454,9 +1458,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; - if (after(TCP_SKB_CB(skb)->seq, - tp->highest_sack)) + if (after(TCP_SKB_CB(skb)->seq, tp->highest_sack)) { tp->highest_sack = TCP_SKB_CB(skb)->seq; + highest_sack_end_seq = TCP_SKB_CB(skb)->end_seq; + } } else { if (dup_sack && (sacked&TCPCB_RETRANS)) reord = min(fack_count, reord); @@ -1476,8 +1481,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) - flag |= tcp_mark_lost_retrans(sk, lost_retrans); + if (tp->retrans_out && highest_sack_end_seq && + after(highest_sack_end_seq, tp->high_seq) && + icsk->icsk_ca_state == TCP_CA_Recovery) + flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); tcp_verify_left_out(tp); -- cgit v1.2.3 From b08d6cb22c777c8c91c16d8e3b8aafc93c98cbd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 11 Oct 2007 17:36:13 -0700 Subject: [TCP]: Limit processing lost_retrans loop to work-to-do cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This addition of lost_retrans_low to tcp_sock might be unnecessary, it's not clear how often lost_retrans worker is executed when there wasn't work to do. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 +++++++++++--- net/ipv4/tcp_output.c | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d5e0fcc22a3b..0a42e9340346 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1112,7 +1112,8 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from - * SACK blocks by the caller). + * SACK blocks by the caller). Also calculate the lowest snd_nxt among the + * remaining retransmitted skbs to avoid some costly processing per ACKs. */ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) { @@ -1120,6 +1121,7 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) struct sk_buff *skb; int flag = 0; int cnt = 0; + u32 new_low_seq = 0; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; @@ -1151,9 +1153,15 @@ static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto) NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } } else { + if (!new_low_seq || before(ack_seq, new_low_seq)) + new_low_seq = ack_seq; cnt += tcp_skb_pcount(skb); } } + + if (tp->retrans_out) + tp->lost_retrans_low = new_low_seq; + return flag; } @@ -1481,8 +1489,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - if (tp->retrans_out && highest_sack_end_seq && - after(highest_sack_end_seq, tp->high_seq) && + if (tp->retrans_out && + after(highest_sack_end_seq, tp->lost_retrans_low) && icsk->icsk_ca_state == TCP_CA_Recovery) flag |= tcp_mark_lost_retrans(sk, highest_sack_end_seq); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 53296753b0bd..324b4207254a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1914,6 +1914,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) printk(KERN_DEBUG "retrans_out leaked.\n"); } #endif + if (!tp->retrans_out) + tp->lost_retrans_low = tp->snd_nxt; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); -- cgit v1.2.3