summaryrefslogtreecommitdiffstats
path: root/net/dccp
diff options
context:
space:
mode:
Diffstat (limited to 'net/dccp')
-rw-r--r--net/dccp/Makefile4
-rw-r--r--net/dccp/ackvec.c415
-rw-r--r--net/dccp/ackvec.h38
-rw-r--r--net/dccp/ccids/ccid2.c134
-rw-r--r--net/dccp/ccids/ccid2.h2
-rw-r--r--net/dccp/dccp.h21
-rw-r--r--net/dccp/input.c34
-rw-r--r--net/dccp/ipv4.c13
-rw-r--r--net/dccp/options.c43
-rw-r--r--net/dccp/output.c22
-rw-r--r--net/dccp/proto.c71
-rw-r--r--net/dccp/qpolicy.c137
12 files changed, 566 insertions, 368 deletions
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2991efcc8dea..5c8362b037ed 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,7 +1,7 @@
obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o
-
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+ qpolicy.o
#
# CCID algorithms to be used by dccp.ko
#
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index abaf241c7353..25b7a8d1ad58 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -9,18 +9,10 @@
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License;
*/
-
-#include "ackvec.h"
#include "dccp.h"
-
-#include <linux/init.h>
-#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/skbuff.h>
#include <linux/slab.h>
-#include <net/sock.h>
-
static struct kmem_cache *dccp_ackvec_slab;
static struct kmem_cache *dccp_ackvec_record_slab;
@@ -92,6 +84,24 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
return 0;
}
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+ const u64 ackno)
+{
+ struct dccp_ackvec_record *avr;
+ /*
+ * Exploit that records are inserted in descending order of sequence
+ * number, start with the oldest record first. If @ackno is `before'
+ * the earliest ack_ackno, the packet is too old to be considered.
+ */
+ list_for_each_entry_reverse(avr, av_list, avr_node) {
+ if (avr->avr_ack_seqno == ackno)
+ return avr;
+ if (before48(ackno, avr->avr_ack_seqno))
+ break;
+ }
+ return NULL;
+}
+
/*
* Buffer index and length computation using modulo-buffersize arithmetic.
* Note that, as pointers move from right to left, head is `before' tail.
@@ -113,248 +123,253 @@ u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
}
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
+/**
+ * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
+ * @av: non-empty buffer to update
+ * @distance: negative or zero distance of @seqno from buf_ackno downward
+ * @seqno: the (old) sequence number whose record is to be updated
+ * @state: state in which packet carrying @seqno was received
*/
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
- const unsigned int packets,
- const unsigned char state)
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+ u64 seqno, enum dccp_ackvec_states state)
{
- long gap;
- long new_head;
+ u16 ptr = av->av_buf_head;
- if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN)
- return -ENOBUFS;
+ BUG_ON(distance > 0);
+ if (unlikely(dccp_ackvec_is_empty(av)))
+ return;
- gap = packets - 1;
- new_head = av->av_buf_head - packets;
+ do {
+ u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
- if (new_head < 0) {
- if (gap > 0) {
- memset(av->av_buf, DCCPAV_NOT_RECEIVED,
- gap + new_head + 1);
- gap = -new_head;
+ if (distance + runlen >= 0) {
+ /*
+ * Only update the state if packet has not been received
+ * yet. This is OK as per the second table in RFC 4340,
+ * 11.4.1; i.e. here we are using the following table:
+ * RECEIVED
+ * 0 1 3
+ * S +---+---+---+
+ * T 0 | 0 | 0 | 0 |
+ * O +---+---+---+
+ * R 1 | 1 | 1 | 1 |
+ * E +---+---+---+
+ * D 3 | 0 | 1 | 3 |
+ * +---+---+---+
+ * The "Not Received" state was set by reserve_seats().
+ */
+ if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+ av->av_buf[ptr] = state;
+ else
+ dccp_pr_debug("Not changing %llu state to %u\n",
+ (unsigned long long)seqno, state);
+ break;
}
- new_head += DCCPAV_MAX_ACKVEC_LEN;
- }
- av->av_buf_head = new_head;
+ distance += runlen + 1;
+ ptr = __ackvec_idx_add(ptr, 1);
- if (gap > 0)
- memset(av->av_buf + av->av_buf_head + 1,
- DCCPAV_NOT_RECEIVED, gap);
+ } while (ptr != av->av_buf_tail);
+}
- av->av_buf[av->av_buf_head] = state;
- av->av_vec_len += packets;
- return 0;
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+ u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+ len = DCCPAV_MAX_ACKVEC_LEN - start;
+
+ /* check for buffer wrap-around */
+ if (num > len) {
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+ start = 0;
+ num -= len;
+ }
+ if (num)
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
}
-/*
- * Implements the RFC 4340, Appendix A
+/**
+ * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
+ * @av: container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno: sequence number of the first packet in @num_packets
+ * @state: state in which packet carrying @seqno was received
*/
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+ u64 seqno, enum dccp_ackvec_states state)
{
- u8 *cur_head = av->av_buf + av->av_buf_head,
- *buf_end = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
- /*
- * Check at the right places if the buffer is full, if it is, tell the
- * caller to start dropping packets till the HC-Sender acks our ACK
- * vectors, when we will free up space in av_buf.
- *
- * We may well decide to do buffer compression, etc, but for now lets
- * just drop.
- *
- * From Appendix A.1.1 (`New Packets'):
- *
- * Of course, the circular buffer may overflow, either when the
- * HC-Sender is sending data at a very high rate, when the
- * HC-Receiver's acknowledgements are not reaching the HC-Sender,
- * or when the HC-Sender is forgetting to acknowledge those acks
- * (so the HC-Receiver is unable to clean up old state). In this
- * case, the HC-Receiver should either compress the buffer (by
- * increasing run lengths when possible), transfer its state to
- * a larger buffer, or, as a last resort, drop all received
- * packets, without processing them whatsoever, until its buffer
- * shrinks again.
- */
+ u32 num_cells = num_packets;
- /* See if this is the first ackno being inserted */
- if (av->av_vec_len == 0) {
- *cur_head = state;
- av->av_vec_len = 1;
- } else if (after48(ackno, av->av_buf_ackno)) {
- const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno);
+ if (num_packets > DCCPAV_BURST_THRESH) {
+ u32 lost_packets = num_packets - 1;
+ DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
/*
- * Look if the state of this packet is the same as the
- * previous ackno and if so if we can bump the head len.
+ * We received 1 packet and have a loss of size "num_packets-1"
+ * which we squeeze into num_cells-1 rather than reserving an
+ * entire byte for each lost packet.
+ * The reason is that the vector grows in O(burst_length); when
+ * it grows too large there will no room left for the payload.
+ * This is a trade-off: if a few packets out of the burst show
+ * up later, their state will not be changed; it is simply too
+ * costly to reshuffle/reallocate/copy the buffer each time.
+ * Should such problems persist, we will need to switch to a
+ * different underlying data structure.
*/
- if (delta == 1 && dccp_ackvec_state(cur_head) == state &&
- dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN)
- *cur_head += 1;
- else if (dccp_ackvec_set_buf_head_state(av, delta, state))
- return -ENOBUFS;
- } else {
- /*
- * A.1.2. Old Packets
- *
- * When a packet with Sequence Number S <= buf_ackno
- * arrives, the HC-Receiver will scan the table for
- * the byte corresponding to S. (Indexing structures
- * could reduce the complexity of this scan.)
- */
- u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno);
+ for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+ u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
- while (1) {
- const u8 len = dccp_ackvec_runlen(cur_head);
- /*
- * valid packets not yet in av_buf have a reserved
- * entry, with a len equal to 0.
- */
- if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) {
- dccp_pr_debug("Found %llu reserved seat!\n",
- (unsigned long long)ackno);
- *cur_head = state;
- goto out;
- }
- /* len == 0 means one packet */
- if (delta < len + 1)
- goto out_duplicate;
-
- delta -= len + 1;
- if (++cur_head == buf_end)
- cur_head = av->av_buf;
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+ av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+ lost_packets -= len;
}
}
- av->av_buf_ackno = ackno;
-out:
- return 0;
+ if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+ DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+ av->av_overflow = true;
+ }
+
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+ if (av->av_overflow)
+ av->av_buf_tail = av->av_buf_head;
+
+ av->av_buf[av->av_buf_head] = state;
+ av->av_buf_ackno = seqno;
-out_duplicate:
- /* Duplicate packet */
- dccp_pr_debug("Received a dup or already considered lost "
- "packet: %llu\n", (unsigned long long)ackno);
- return -EILSEQ;
+ if (num_packets > 1)
+ dccp_ackvec_reserve_seats(av, num_packets - 1);
}
-static void dccp_ackvec_throw_record(struct dccp_ackvec *av,
- struct dccp_ackvec_record *avr)
+/**
+ * dccp_ackvec_input - Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
{
- struct dccp_ackvec_record *next;
+ u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ enum dccp_ackvec_states state = DCCPAV_RECEIVED;
- /* sort out vector length */
- if (av->av_buf_head <= avr->avr_ack_ptr)
- av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head;
- else
- av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 -
- av->av_buf_head + avr->avr_ack_ptr;
+ if (dccp_ackvec_is_empty(av)) {
+ dccp_ackvec_add_new(av, 1, seqno, state);
+ av->av_tail_ackno = seqno;
- /* free records */
- list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
- list_del(&avr->avr_node);
- kmem_cache_free(dccp_ackvec_record_slab, avr);
- }
-}
+ } else {
+ s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+ u8 *current_head = av->av_buf + av->av_buf_head;
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
- const u64 ackno)
-{
- struct dccp_ackvec_record *avr;
+ if (num_packets == 1 &&
+ dccp_ackvec_state(current_head) == state &&
+ dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
- /*
- * If we traverse backwards, it should be faster when we have large
- * windows. We will be receiving ACKs for stuff we sent a while back
- * -sorbo.
- */
- list_for_each_entry_reverse(avr, &av->av_records, avr_node) {
- if (ackno == avr->avr_ack_seqno) {
- dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu, ACKED!\n",
- dccp_role(sk), avr->avr_ack_runlen,
- (unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- } else if (avr->avr_ack_seqno > ackno)
- break; /* old news */
+ *current_head += 1;
+ av->av_buf_ackno = seqno;
+
+ } else if (num_packets > 0) {
+ dccp_ackvec_add_new(av, num_packets, seqno, state);
+ } else {
+ dccp_ackvec_update_old(av, num_packets, seqno, state);
+ }
}
}
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
- struct sock *sk, u64 *ackno,
- const unsigned char len,
- const unsigned char *vector)
+/**
+ * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
{
- unsigned char i;
- struct dccp_ackvec_record *avr;
+ struct dccp_ackvec_record *avr, *next;
+ u8 runlen_now, eff_runlen;
+ s64 delta;
- /* Check if we actually sent an ACK vector */
- if (list_empty(&av->av_records))
+ avr = dccp_ackvec_lookup(&av->av_records, ackno);
+ if (avr == NULL)
return;
+ /*
+ * Deal with outdated acknowledgments: this arises when e.g. there are
+ * several old records and the acks from the peer come in slowly. In
+ * that case we may still have records that pre-date tail_ackno.
+ */
+ delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+ if (delta < 0)
+ goto free_records;
+ /*
+ * Deal with overlapping Ack Vectors: don't subtract more than the
+ * number of packets between tail_ackno and ack_ackno.
+ */
+ eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
- i = len;
+ runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
/*
- * XXX
- * I think it might be more efficient to work backwards. See comment on
- * rcv_ackno. -sorbo.
+ * The run length of Ack Vector cells does not decrease over time. If
+ * the run length is the same as at the time the Ack Vector was sent, we
+ * free the ack_ptr cell. That cell can however not be freed if the run
+ * length has increased: in this case we need to move the tail pointer
+ * backwards (towards higher indices), to its next-oldest neighbour.
*/
- avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node);
- while (i--) {
- const u8 rl = dccp_ackvec_runlen(vector);
- u64 ackno_end_rl;
+ if (runlen_now > eff_runlen) {
- dccp_set_seqno(&ackno_end_rl, *ackno - rl);
+ av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+ av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
+ /* This move may not have cleared the overflow flag. */
+ if (av->av_overflow)
+ av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+ } else {
+ av->av_buf_tail = avr->avr_ack_ptr;
/*
- * If our AVR sequence number is greater than the ack, go
- * forward in the AVR list until it is not so.
+ * We have made sure that avr points to a valid cell within the
+ * buffer. This cell is either older than head, or equals head
+ * (empty buffer): in both cases we no longer have any overflow.
*/
- list_for_each_entry_from(avr, &av->av_records, avr_node) {
- if (!after48(avr->avr_ack_seqno, *ackno))
- goto found;
- }
- /* End of the av_records list, not found, exit */
- break;
-found:
- if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) {
- if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) {
- dccp_pr_debug("%s ACK vector 0, len=%d, "
- "ack_seqno=%llu, ack_ackno=%llu, "
- "ACKED!\n",
- dccp_role(sk), len,
- (unsigned long long)
- avr->avr_ack_seqno,
- (unsigned long long)
- avr->avr_ack_ackno);
- dccp_ackvec_throw_record(av, avr);
- break;
- }
- /*
- * If it wasn't received, continue scanning... we might
- * find another one.
- */
- }
+ av->av_overflow = 0;
+ }
- dccp_set_seqno(ackno, ackno_end_rl - 1);
- ++vector;
+ /*
+ * The peer has acknowledged up to and including ack_ackno. Hence the
+ * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
+ */
+ av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+ list_del(&avr->avr_node);
+ kmem_cache_free(dccp_ackvec_record_slab, avr);
}
}
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt, const u8 *value, const u8 len)
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
{
- if (len > DCCP_SINGLE_OPT_MAXLEN)
- return -1;
+ struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
+
+ if (new == NULL)
+ return -ENOBUFS;
+ new->vec = vec;
+ new->len = len;
+ new->nonce = nonce;
- /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
- dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
- ackno, len, value);
+ list_add_tail(&new->node, head);
return 0;
}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
+
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+ struct dccp_ackvec_parsed *cur, *next;
+
+ list_for_each_entry_safe(cur, next, parsed_chunks, node)
+ kfree(cur);
+ INIT_LIST_HEAD(parsed_chunks);
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
int __init dccp_ackvec_init(void)
{
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 23880be8fc29..e2ab0627a5ff 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -29,6 +29,9 @@
/* Estimated minimum average Ack Vector length - used for updating MPS */
#define DCCPAV_MIN_OPTLEN 16
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
+
enum dccp_ackvec_states {
DCCPAV_RECEIVED = 0x00,
DCCPAV_ECN_MARKED = 0x40,
@@ -61,7 +64,6 @@ static inline u8 dccp_ackvec_state(const u8 *cell)
* %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
* @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
* @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
- * @av_veclen: length of the live portion of @av_buf
*/
struct dccp_ackvec {
u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
@@ -72,7 +74,6 @@ struct dccp_ackvec {
bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
u8 av_overflow:1;
struct list_head av_records;
- u16 av_vec_len;
};
/** struct dccp_ackvec_record - Records information about sent Ack Vectors
@@ -98,29 +99,38 @@ struct dccp_ackvec_record {
u8 avr_ack_nonce:1;
};
-struct sock;
-struct sk_buff;
-
extern int dccp_ackvec_init(void);
extern void dccp_ackvec_exit(void);
extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
extern void dccp_ackvec_free(struct dccp_ackvec *av);
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state);
-
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- u64 *ackno, const u8 opt,
- const u8 *value, const u8 len);
-
+extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
{
return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
}
+
+/**
+ * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
+ * @vec: start of vector (offset into skb)
+ * @len: length of @vec
+ * @nonce: whether @vec had an ECN nonce of 0 or 1
+ * @node: FIFO - arranged in descending order of ack_ackno
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+ u8 *vec,
+ len,
+ nonce:1;
+ struct list_head node;
+};
+
+extern int dccp_ackvec_parsed_add(struct list_head *head,
+ u8 *vec, u8 len, u8 nonce);
+extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index cb1b4a0d1877..e96d5e810039 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -246,68 +246,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
#endif
}
-/* XXX Lame code duplication!
- * returns -1 if none was found.
- * else returns the next offset to use in the function call.
- */
-static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset,
- unsigned char **vec, unsigned char *veclen)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- unsigned char opt, len;
- unsigned char *value;
-
- BUG_ON(offset < 0);
- options += offset;
- opt_ptr = options;
- if (opt_ptr >= opt_end)
- return -1;
-
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_invalid_option;
-
- len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
- /*
- * Remove the type and len fields, leaving
- * just the value size
- */
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
-
- if (opt_ptr > opt_end)
- goto out_invalid_option;
- }
-
- switch (opt) {
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- *vec = value;
- *veclen = len;
- return offset + (opt_ptr - options);
- }
- }
-
- return -1;
-
-out_invalid_option:
- DCCP_BUG("Invalid option - this should not happen (previous parsing)!");
- return -1;
-}
-
/**
* ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
* This code is almost identical with TCP's tcp_rtt_estimator(), since
@@ -432,16 +370,28 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
ccid2_change_l_ack_ratio(sk, hc->tx_cwnd);
}
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+ switch (option) {
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+ option - DCCPO_ACK_VECTOR_0);
+ }
+ return 0;
+}
+
static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+ struct dccp_ackvec_parsed *avp;
u64 ackno, seqno;
struct ccid2_seq *seqp;
- unsigned char *vector;
- unsigned char veclen;
- int offset = 0;
int done = 0;
unsigned int maxincr = 0;
@@ -475,17 +425,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
}
/* check forward path congestion */
- /* still didn't send out new data packets */
- if (hc->tx_seqh == hc->tx_seqt)
+ if (dccp_packet_without_ack(skb))
return;
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_ACK:
- case DCCP_PKT_DATAACK:
- break;
- default:
- return;
- }
+ /* still didn't send out new data packets */
+ if (hc->tx_seqh == hc->tx_seqt)
+ goto done;
ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
if (after48(ackno, hc->tx_high_ack))
@@ -509,15 +454,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
/* go through all ack vectors */
- while ((offset = ccid2_ackvector(sk, skb, offset,
- &vector, &veclen)) != -1) {
+ list_for_each_entry(avp, &hc->tx_av_chunks, node) {
/* go through this ack vector */
- while (veclen--) {
- u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector));
+ for (; avp->len--; avp->vec++) {
+ u64 ackno_end_rl = SUB48(ackno,
+ dccp_ackvec_runlen(avp->vec));
- ccid2_pr_debug("ackvec start:%llu end:%llu\n",
+ ccid2_pr_debug("ackvec %llu |%u,%u|\n",
(unsigned long long)ackno,
- (unsigned long long)ackno_end_rl);
+ dccp_ackvec_state(avp->vec) >> 6,
+ dccp_ackvec_runlen(avp->vec));
/* if the seqno we are analyzing is larger than the
* current ackno, then move towards the tail of our
* seqnos.
@@ -536,7 +482,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
* run length
*/
while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
- const u8 state = dccp_ackvec_state(vector);
+ const u8 state = dccp_ackvec_state(avp->vec);
/* new packet received or marked */
if (state != DCCPAV_NOT_RECEIVED &&
@@ -563,7 +509,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
break;
ackno = SUB48(ackno_end_rl, 1);
- vector++;
}
if (done)
break;
@@ -631,10 +576,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
sk_stop_timer(sk, &hc->tx_rtotimer);
else
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-
+done:
/* check if incoming Acks allow pending packets to be sent */
if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
@@ -663,6 +609,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
hc->tx_last_cong = ccid2_time_stamp;
setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
(unsigned long)sk);
+ INIT_LIST_HEAD(&hc->tx_av_chunks);
return 0;
}
@@ -696,16 +643,17 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
}
struct ccid_operations ccid2_ops = {
- .ccid_id = DCCPC_CCID2,
- .ccid_name = "TCP-like",
- .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
- .ccid_hc_tx_init = ccid2_hc_tx_init,
- .ccid_hc_tx_exit = ccid2_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
- .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
- .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
+ .ccid_id = DCCPC_CCID2,
+ .ccid_name = "TCP-like",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
+ .ccid_hc_tx_init = ccid2_hc_tx_init,
+ .ccid_hc_tx_exit = ccid2_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
+ .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+ .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
+ .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
};
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
index 25cb6b216eda..e9985dafc2c7 100644
--- a/net/dccp/ccids/ccid2.h
+++ b/net/dccp/ccids/ccid2.h
@@ -55,6 +55,7 @@ struct ccid2_seq {
* @tx_rtt_seq: to decay RTTVAR at most once per flight
* @tx_rpseq: last consecutive seqno
* @tx_rpdupack: dupacks since rpseq
+ * @tx_av_chunks: list of Ack Vectors received on current skb
*/
struct ccid2_hc_tx_sock {
u32 tx_cwnd;
@@ -79,6 +80,7 @@ struct ccid2_hc_tx_sock {
int tx_rpdupack;
u32 tx_last_cong;
u64 tx_high_ack;
+ struct list_head tx_av_chunks;
};
static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 19fafd597465..45087052d894 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -93,9 +93,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
-/* Maximal interval between probes for local resources. */
-#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
-
/* sysctl variables for DCCP */
extern int sysctl_dccp_request_retries;
extern int sysctl_dccp_retries1;
@@ -203,12 +200,7 @@ struct dccp_mib {
DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-#define DCCP_ADD_STATS_BH(field, val) \
- SNMP_ADD_STATS_BH(dccp_statistics, field, val)
-#define DCCP_ADD_STATS_USER(field, val) \
- SNMP_ADD_STATS_USER(dccp_statistics, field, val)
/*
* Checksumming routines
@@ -243,6 +235,19 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
extern void dccp_send_sync(struct sock *sk, const u64 seq,
const enum dccp_pkt_type pkt_type);
+/*
+ * TX Packet Dequeueing Interface
+ */
+extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+extern bool dccp_qpolicy_full(struct sock *sk);
+extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+extern struct sk_buff *dccp_qpolicy_top(struct sock *sk);
+extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
+extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
+
+/*
+ * TX Packet Output and TX Timers
+ */
extern void dccp_write_xmit(struct sock *sk);
extern void dccp_write_space(struct sock *sk);
extern void dccp_flush_write_queue(struct sock *sk, long *time_budget);
diff --git a/net/dccp/input.c b/net/dccp/input.c
index c7aeeba859d4..15af247ea007 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -160,13 +160,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
}
-static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
{
- struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
- if (dp->dccps_hc_rx_ackvec != NULL)
- dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ if (av == NULL)
+ return;
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ dccp_ackvec_input(av, skb);
}
static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
@@ -239,7 +241,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
dccp_update_gsr(sk, seqno);
if (dh->dccph_type != DCCP_PKT_SYNC &&
- (ackno != DCCP_PKT_WITHOUT_ACK_SEQ))
+ ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+ after48(ackno, dp->dccps_gar))
dp->dccps_gar = ackno;
} else {
unsigned long now = jiffies;
@@ -365,21 +368,13 @@ discard:
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned len)
{
- struct dccp_sock *dp = dccp_sk(sk);
-
if (dccp_check_seqno(sk, skb))
goto discard;
if (dccp_parse_options(sk, NULL, skb))
return 1;
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
- goto discard;
+ dccp_handle_ackvec_processing(sk, skb);
dccp_deliver_input_to_ccids(sk, skb);
return __dccp_rcv_established(sk, skb, dh, len);
@@ -631,14 +626,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
if (dccp_parse_options(sk, NULL, skb))
return 1;
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED))
- goto discard;
-
+ dccp_handle_ackvec_processing(sk, skb);
dccp_deliver_input_to_ccids(sk, skb);
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3f69ea114829..45a434f94169 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -462,15 +462,12 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
{
struct rtable *rt;
struct flowi fl = { .oif = skb_rtable(skb)->rt_iif,
- .nl_u = { .ip4_u =
- { .daddr = ip_hdr(skb)->saddr,
- .saddr = ip_hdr(skb)->daddr,
- .tos = RT_CONN_FLAGS(sk) } },
+ .fl4_dst = ip_hdr(skb)->saddr,
+ .fl4_src = ip_hdr(skb)->daddr,
+ .fl4_tos = RT_CONN_FLAGS(sk),
.proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = dccp_hdr(skb)->dccph_dport,
- .dport = dccp_hdr(skb)->dccph_sport }
- }
+ .fl_ip_sport = dccp_hdr(skb)->dccph_dport,
+ .fl_ip_dport = dccp_hdr(skb)->dccph_sport
};
security_skb_classify_flow(skb, &fl);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 5adeeed5e0d2..f06ffcfc8d71 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
struct dccp_sock *dp = dccp_sk(sk);
const struct dccp_hdr *dh = dccp_hdr(skb);
const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
- u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
unsigned char *opt_ptr = options;
const unsigned char *opt_end = (unsigned char *)dh +
@@ -129,14 +128,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
if (rc)
goto out_featneg_failed;
break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
- break;
- if (dp->dccps_hc_rx_ackvec != NULL &&
- dccp_ackvec_parse(sk, skb, &ackno, opt, value, len))
- goto out_invalid_option;
- break;
case DCCPO_TIMESTAMP:
if (len != 4)
goto out_invalid_option;
@@ -226,6 +217,16 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
pkt_type, opt, value, len))
goto out_invalid_option;
break;
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
+ break;
+ /*
+ * Ack vectors are processed by the TX CCID if it is
+ * interested. The RX CCID need not parse Ack Vectors,
+ * since it is only interested in clearing old state.
+ * Fall through.
+ */
case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
pkt_type, opt, value, len))
@@ -429,6 +430,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
const u16 buflen = dccp_ackvec_buflen(av);
/* Figure out how many options do we need to represent the ackvec */
const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
@@ -437,10 +439,25 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
const unsigned char *tail, *from;
unsigned char *to;
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+ if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+ dccp_packet_name(dcb->dccpd_type));
return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+ }
+ /*
+ * Since Ack Vectors are variable-length, we can not always predict
+ * their size. To catch exception cases where the space is running out
+ * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+ */
+ if (len > DCCPAV_MIN_OPTLEN &&
+ len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+ DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+ "MPS=%u ==> reduce payload size?\n", len, skb->len,
+ dcb->dccpd_opt_len, dp->dccps_mss_cache);
+ dp->dccps_sync_scheduled = 1;
+ return 0;
+ }
+ dcb->dccpd_opt_len += len;
to = skb_push(skb, len);
len = buflen;
@@ -481,7 +498,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
/*
* Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
*/
- if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce))
+ if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
return -ENOBUFS;
return 0;
}
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 45b91853f5ae..784d30210543 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -242,7 +242,7 @@ static void dccp_xmit_packet(struct sock *sk)
{
int err, len;
struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue);
+ struct sk_buff *skb = dccp_qpolicy_pop(sk);
if (unlikely(skb == NULL))
return;
@@ -283,6 +283,15 @@ static void dccp_xmit_packet(struct sock *sk)
* any local drop will eventually be reported via receiver feedback.
*/
ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+ /*
+ * If the CCID needs to transfer additional header options out-of-band
+ * (e.g. Ack Vectors or feature-negotiation options), it activates this
+ * flag to schedule a Sync. The Sync will automatically incorporate all
+ * currently pending header options, thus clearing the backlog.
+ */
+ if (dp->dccps_sync_scheduled)
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
}
/**
@@ -336,7 +345,7 @@ void dccp_write_xmit(struct sock *sk)
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
- while ((skb = skb_peek(&sk->sk_write_queue))) {
+ while ((skb = dccp_qpolicy_top(sk))) {
int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
switch (ccid_packet_dequeue_eval(rc)) {
@@ -350,8 +359,7 @@ void dccp_write_xmit(struct sock *sk)
dccp_xmit_packet(sk);
break;
case CCID_PACKET_ERR:
- skb_dequeue(&sk->sk_write_queue);
- kfree_skb(skb);
+ dccp_qpolicy_drop(sk, skb);
dccp_pr_debug("packet discarded due to err=%d\n", rc);
}
}
@@ -636,6 +644,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno,
DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
+ /*
+ * Clear the flag in case the Sync was scheduled for out-of-band data,
+ * such as carrying a long Ack Vector.
+ */
+ dccp_sk(sk)->dccps_sync_scheduled = 0;
+
dccp_transmit_skb(sk, skb);
}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index ef343d53fcea..152975d942d9 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -185,6 +185,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
dp->dccps_role = DCCP_ROLE_UNDEFINED;
dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1;
+ dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
dccp_init_xmit_timers(sk);
@@ -532,6 +533,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
case DCCP_SOCKOPT_RECV_CSCOV:
err = dccp_setsockopt_cscov(sk, val, true);
break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ if (sk->sk_state != DCCP_CLOSED)
+ err = -EISCONN;
+ else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+ err = -EINVAL;
+ else
+ dp->dccps_qpolicy = val;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ if (val < 0)
+ err = -EINVAL;
+ else
+ dp->dccps_tx_qlen = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -639,6 +654,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
case DCCP_SOCKOPT_RECV_CSCOV:
val = dp->dccps_pcrlen;
break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ val = dp->dccps_qpolicy;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ val = dp->dccps_tx_qlen;
+ break;
case 128 ... 191:
return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
len, (u32 __user *)optval, optlen);
@@ -681,6 +702,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
#endif
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
+{
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+ /*
+ * Assign an (opaque) qpolicy priority value to skb->priority.
+ *
+ * We are overloading this skb field for use with the qpolicy subystem.
+ * The skb->priority is normally used for the SO_PRIORITY option, which
+ * is initialised from sk_priority. Since the assignment of sk_priority
+ * to skb->priority happens later (on layer 3), we overload this field
+ * for use with queueing priorities as long as the skb is on layer 4.
+ * The default priority value (if nothing is set) is 0.
+ */
+ skb->priority = 0;
+
+ for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_DCCP)
+ continue;
+
+ if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+ !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+ return -EINVAL;
+
+ switch (cmsg->cmsg_type) {
+ case DCCP_SCM_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+ return -EINVAL;
+ skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -696,8 +758,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
lock_sock(sk);
- if (sysctl_dccp_tx_qlen &&
- (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
+ if (dccp_qpolicy_full(sk)) {
rc = -EAGAIN;
goto out_release;
}
@@ -725,7 +786,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (rc != 0)
goto out_discard;
- skb_queue_tail(&sk->sk_write_queue, skb);
+ rc = dccp_msghdr_parse(msg, skb);
+ if (rc != 0)
+ goto out_discard;
+
+ dccp_qpolicy_push(sk, skb);
/*
* The xmit_timer is set if the TX CCID is rate-based and will expire
* when congestion control permits to release further packets into the
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 000000000000..63c30bfa4703
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ * net/dccp/qpolicy.c
+ *
+ * Policy-based packet dequeueing interface for DCCP.
+ *
+ * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ * Simple Dequeueing Policy:
+ * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+ skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_tx_qlen &&
+ sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+ return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ * Priority-based Dequeueing Policy:
+ * If tx_qlen is different from 0 and the queue has reached its upper bound
+ * of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *best = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (best == NULL || skb->priority > best->priority)
+ best = skb;
+ return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *worst = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (worst == NULL || skb->priority < worst->priority)
+ worst = skb;
+ return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+ if (qpolicy_simple_full(sk))
+ dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+ return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top: peeks at whatever the queueing policy defines as its `top'
+ */
+static struct dccp_qpolicy_operations {
+ void (*push) (struct sock *sk, struct sk_buff *skb);
+ bool (*full) (struct sock *sk);
+ struct sk_buff* (*top) (struct sock *sk);
+ __be32 params;
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+ [DCCPQ_POLICY_SIMPLE] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_simple_full,
+ .top = qpolicy_simple_top,
+ .params = 0,
+ },
+ [DCCPQ_POLICY_PRIO] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_prio_full,
+ .top = qpolicy_prio_best_skb,
+ .params = DCCP_SCM_PRIORITY,
+ },
+};
+
+/*
+ * Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+ qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb != NULL) {
+ skb_unlink(skb, &sk->sk_write_queue);
+ kfree_skb(skb);
+ }
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+ struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+ if (skb != NULL) {
+ /* Clear any skb fields that we used internally */
+ skb->priority = 0;
+ skb_unlink(skb, &sk->sk_write_queue);
+ }
+ return skb;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+ /* check if exactly one bit is set */
+ if (!param || (param & (param - 1)))
+ return false;
+ return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}