diff options
Diffstat (limited to 'net/dccp')
-rw-r--r-- | net/dccp/Makefile | 4 | ||||
-rw-r--r-- | net/dccp/ackvec.c | 415 | ||||
-rw-r--r-- | net/dccp/ackvec.h | 38 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.c | 134 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.h | 2 | ||||
-rw-r--r-- | net/dccp/dccp.h | 21 | ||||
-rw-r--r-- | net/dccp/input.c | 34 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 13 | ||||
-rw-r--r-- | net/dccp/options.c | 43 | ||||
-rw-r--r-- | net/dccp/output.c | 22 | ||||
-rw-r--r-- | net/dccp/proto.c | 71 | ||||
-rw-r--r-- | net/dccp/qpolicy.c | 137 |
12 files changed, 566 insertions, 368 deletions
diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 2991efcc8dea..5c8362b037ed 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o - +dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \ + qpolicy.o # # CCID algorithms to be used by dccp.ko # diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index abaf241c7353..25b7a8d1ad58 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -9,18 +9,10 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License; */ - -#include "ackvec.h" #include "dccp.h" - -#include <linux/init.h> -#include <linux/errno.h> #include <linux/kernel.h> -#include <linux/skbuff.h> #include <linux/slab.h> -#include <net/sock.h> - static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; @@ -92,6 +84,24 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) return 0; } +static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; + /* + * Exploit that records are inserted in descending order of sequence + * number, start with the oldest record first. If @ackno is `before' + * the earliest ack_ackno, the packet is too old to be considered. + */ + list_for_each_entry_reverse(avr, av_list, avr_node) { + if (avr->avr_ack_seqno == ackno) + return avr; + if (before48(ackno, avr->avr_ack_seqno)) + break; + } + return NULL; +} + /* * Buffer index and length computation using modulo-buffersize arithmetic. * Note that, as pointers move from right to left, head is `before' tail. @@ -113,248 +123,253 @@ u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); } -/* - * If several packets are missing, the HC-Receiver may prefer to enter multiple - * bytes with run length 0, rather than a single byte with a larger run length; - * this simplifies table updates if one of the missing packets arrives. +/** + * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 + * @av: non-empty buffer to update + * @distance: negative or zero distance of @seqno from buf_ackno downward + * @seqno: the (old) sequence number whose record is to be updated + * @state: state in which packet carrying @seqno was received */ -static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, - const unsigned int packets, - const unsigned char state) +static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, + u64 seqno, enum dccp_ackvec_states state) { - long gap; - long new_head; + u16 ptr = av->av_buf_head; - if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN) - return -ENOBUFS; + BUG_ON(distance > 0); + if (unlikely(dccp_ackvec_is_empty(av))) + return; - gap = packets - 1; - new_head = av->av_buf_head - packets; + do { + u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); - if (new_head < 0) { - if (gap > 0) { - memset(av->av_buf, DCCPAV_NOT_RECEIVED, - gap + new_head + 1); - gap = -new_head; + if (distance + runlen >= 0) { + /* + * Only update the state if packet has not been received + * yet. This is OK as per the second table in RFC 4340, + * 11.4.1; i.e. here we are using the following table: + * RECEIVED + * 0 1 3 + * S +---+---+---+ + * T 0 | 0 | 0 | 0 | + * O +---+---+---+ + * R 1 | 1 | 1 | 1 | + * E +---+---+---+ + * D 3 | 0 | 1 | 3 | + * +---+---+---+ + * The "Not Received" state was set by reserve_seats(). + */ + if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) + av->av_buf[ptr] = state; + else + dccp_pr_debug("Not changing %llu state to %u\n", + (unsigned long long)seqno, state); + break; } - new_head += DCCPAV_MAX_ACKVEC_LEN; - } - av->av_buf_head = new_head; + distance += runlen + 1; + ptr = __ackvec_idx_add(ptr, 1); - if (gap > 0) - memset(av->av_buf + av->av_buf_head + 1, - DCCPAV_NOT_RECEIVED, gap); + } while (ptr != av->av_buf_tail); +} - av->av_buf[av->av_buf_head] = state; - av->av_vec_len += packets; - return 0; +/* Mark @num entries after buf_head as "Not yet received". */ +static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) +{ + u16 start = __ackvec_idx_add(av->av_buf_head, 1), + len = DCCPAV_MAX_ACKVEC_LEN - start; + + /* check for buffer wrap-around */ + if (num > len) { + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); + start = 0; + num -= len; + } + if (num) + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); } -/* - * Implements the RFC 4340, Appendix A +/** + * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer + * @av: container of buffer to update (can be empty or non-empty) + * @num_packets: number of packets to register (must be >= 1) + * @seqno: sequence number of the first packet in @num_packets + * @state: state in which packet carrying @seqno was received */ -int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state) +static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, + u64 seqno, enum dccp_ackvec_states state) { - u8 *cur_head = av->av_buf + av->av_buf_head, - *buf_end = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - /* - * Check at the right places if the buffer is full, if it is, tell the - * caller to start dropping packets till the HC-Sender acks our ACK - * vectors, when we will free up space in av_buf. - * - * We may well decide to do buffer compression, etc, but for now lets - * just drop. - * - * From Appendix A.1.1 (`New Packets'): - * - * Of course, the circular buffer may overflow, either when the - * HC-Sender is sending data at a very high rate, when the - * HC-Receiver's acknowledgements are not reaching the HC-Sender, - * or when the HC-Sender is forgetting to acknowledge those acks - * (so the HC-Receiver is unable to clean up old state). In this - * case, the HC-Receiver should either compress the buffer (by - * increasing run lengths when possible), transfer its state to - * a larger buffer, or, as a last resort, drop all received - * packets, without processing them whatsoever, until its buffer - * shrinks again. - */ + u32 num_cells = num_packets; - /* See if this is the first ackno being inserted */ - if (av->av_vec_len == 0) { - *cur_head = state; - av->av_vec_len = 1; - } else if (after48(ackno, av->av_buf_ackno)) { - const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); + if (num_packets > DCCPAV_BURST_THRESH) { + u32 lost_packets = num_packets - 1; + DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); /* - * Look if the state of this packet is the same as the - * previous ackno and if so if we can bump the head len. + * We received 1 packet and have a loss of size "num_packets-1" + * which we squeeze into num_cells-1 rather than reserving an + * entire byte for each lost packet. + * The reason is that the vector grows in O(burst_length); when + * it grows too large there will no room left for the payload. + * This is a trade-off: if a few packets out of the burst show + * up later, their state will not be changed; it is simply too + * costly to reshuffle/reallocate/copy the buffer each time. + * Should such problems persist, we will need to switch to a + * different underlying data structure. */ - if (delta == 1 && dccp_ackvec_state(cur_head) == state && - dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN) - *cur_head += 1; - else if (dccp_ackvec_set_buf_head_state(av, delta, state)) - return -ENOBUFS; - } else { - /* - * A.1.2. Old Packets - * - * When a packet with Sequence Number S <= buf_ackno - * arrives, the HC-Receiver will scan the table for - * the byte corresponding to S. (Indexing structures - * could reduce the complexity of this scan.) - */ - u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); + for (num_packets = num_cells = 1; lost_packets; ++num_cells) { + u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); - while (1) { - const u8 len = dccp_ackvec_runlen(cur_head); - /* - * valid packets not yet in av_buf have a reserved - * entry, with a len equal to 0. - */ - if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) { - dccp_pr_debug("Found %llu reserved seat!\n", - (unsigned long long)ackno); - *cur_head = state; - goto out; - } - /* len == 0 means one packet */ - if (delta < len + 1) - goto out_duplicate; - - delta -= len + 1; - if (++cur_head == buf_end) - cur_head = av->av_buf; + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); + av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; + + lost_packets -= len; } } - av->av_buf_ackno = ackno; -out: - return 0; + if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { + DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); + av->av_overflow = true; + } + + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); + if (av->av_overflow) + av->av_buf_tail = av->av_buf_head; + + av->av_buf[av->av_buf_head] = state; + av->av_buf_ackno = seqno; -out_duplicate: - /* Duplicate packet */ - dccp_pr_debug("Received a dup or already considered lost " - "packet: %llu\n", (unsigned long long)ackno); - return -EILSEQ; + if (num_packets > 1) + dccp_ackvec_reserve_seats(av, num_packets - 1); } -static void dccp_ackvec_throw_record(struct dccp_ackvec *av, - struct dccp_ackvec_record *avr) +/** + * dccp_ackvec_input - Register incoming packet in the buffer + */ +void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) { - struct dccp_ackvec_record *next; + u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; + enum dccp_ackvec_states state = DCCPAV_RECEIVED; - /* sort out vector length */ - if (av->av_buf_head <= avr->avr_ack_ptr) - av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; - else - av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 - - av->av_buf_head + avr->avr_ack_ptr; + if (dccp_ackvec_is_empty(av)) { + dccp_ackvec_add_new(av, 1, seqno, state); + av->av_tail_ackno = seqno; - /* free records */ - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); - } -} + } else { + s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); + u8 *current_head = av->av_buf + av->av_buf_head; -void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, - const u64 ackno) -{ - struct dccp_ackvec_record *avr; + if (num_packets == 1 && + dccp_ackvec_state(current_head) == state && + dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { - /* - * If we traverse backwards, it should be faster when we have large - * windows. We will be receiving ACKs for stuff we sent a while back - * -sorbo. - */ - list_for_each_entry_reverse(avr, &av->av_records, avr_node) { - if (ackno == avr->avr_ack_seqno) { - dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " - "ack_ackno=%llu, ACKED!\n", - dccp_role(sk), avr->avr_ack_runlen, - (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno); - dccp_ackvec_throw_record(av, avr); - break; - } else if (avr->avr_ack_seqno > ackno) - break; /* old news */ + *current_head += 1; + av->av_buf_ackno = seqno; + + } else if (num_packets > 0) { + dccp_ackvec_add_new(av, num_packets, seqno, state); + } else { + dccp_ackvec_update_old(av, num_packets, seqno, state); + } } } -static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, - struct sock *sk, u64 *ackno, - const unsigned char len, - const unsigned char *vector) +/** + * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection + * This routine is called when the peer acknowledges the receipt of Ack Vectors + * up to and including @ackno. While based on on section A.3 of RFC 4340, here + * are additional precautions to prevent corrupted buffer state. In particular, + * we use tail_ackno to identify outdated records; it always marks the earliest + * packet of group (2) in 11.4.2. + */ +void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) { - unsigned char i; - struct dccp_ackvec_record *avr; + struct dccp_ackvec_record *avr, *next; + u8 runlen_now, eff_runlen; + s64 delta; - /* Check if we actually sent an ACK vector */ - if (list_empty(&av->av_records)) + avr = dccp_ackvec_lookup(&av->av_records, ackno); + if (avr == NULL) return; + /* + * Deal with outdated acknowledgments: this arises when e.g. there are + * several old records and the acks from the peer come in slowly. In + * that case we may still have records that pre-date tail_ackno. + */ + delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); + if (delta < 0) + goto free_records; + /* + * Deal with overlapping Ack Vectors: don't subtract more than the + * number of packets between tail_ackno and ack_ackno. + */ + eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - i = len; + runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); /* - * XXX - * I think it might be more efficient to work backwards. See comment on - * rcv_ackno. -sorbo. + * The run length of Ack Vector cells does not decrease over time. If + * the run length is the same as at the time the Ack Vector was sent, we + * free the ack_ptr cell. That cell can however not be freed if the run + * length has increased: in this case we need to move the tail pointer + * backwards (towards higher indices), to its next-oldest neighbour. */ - avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); - while (i--) { - const u8 rl = dccp_ackvec_runlen(vector); - u64 ackno_end_rl; + if (runlen_now > eff_runlen) { - dccp_set_seqno(&ackno_end_rl, *ackno - rl); + av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; + av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + /* This move may not have cleared the overflow flag. */ + if (av->av_overflow) + av->av_overflow = (av->av_buf_head == av->av_buf_tail); + } else { + av->av_buf_tail = avr->avr_ack_ptr; /* - * If our AVR sequence number is greater than the ack, go - * forward in the AVR list until it is not so. + * We have made sure that avr points to a valid cell within the + * buffer. This cell is either older than head, or equals head + * (empty buffer): in both cases we no longer have any overflow. */ - list_for_each_entry_from(avr, &av->av_records, avr_node) { - if (!after48(avr->avr_ack_seqno, *ackno)) - goto found; - } - /* End of the av_records list, not found, exit */ - break; -found: - if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { - if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) { - dccp_pr_debug("%s ACK vector 0, len=%d, " - "ack_seqno=%llu, ack_ackno=%llu, " - "ACKED!\n", - dccp_role(sk), len, - (unsigned long long) - avr->avr_ack_seqno, - (unsigned long long) - avr->avr_ack_ackno); - dccp_ackvec_throw_record(av, avr); - break; - } - /* - * If it wasn't received, continue scanning... we might - * find another one. - */ - } + av->av_overflow = 0; + } - dccp_set_seqno(ackno, ackno_end_rl - 1); - ++vector; + /* + * The peer has acknowledged up to and including ack_ackno. Hence the + * first packet in group (2) of 11.4.2 is the successor of ack_ackno. + */ + av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + +free_records: + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del(&avr->avr_node); + kmem_cache_free(dccp_ackvec_record_slab, avr); } } -int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, - u64 *ackno, const u8 opt, const u8 *value, const u8 len) +/* + * Routines to keep track of Ack Vectors received in an skb + */ +int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) { - if (len > DCCP_SINGLE_OPT_MAXLEN) - return -1; + struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); + + if (new == NULL) + return -ENOBUFS; + new->vec = vec; + new->len = len; + new->nonce = nonce; - /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ - dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, - ackno, len, value); + list_add_tail(&new->node, head); return 0; } +EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); + +void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) +{ + struct dccp_ackvec_parsed *cur, *next; + + list_for_each_entry_safe(cur, next, parsed_chunks, node) + kfree(cur); + INIT_LIST_HEAD(parsed_chunks); +} +EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 23880be8fc29..e2ab0627a5ff 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -29,6 +29,9 @@ /* Estimated minimum average Ack Vector length - used for updating MPS */ #define DCCPAV_MIN_OPTLEN 16 +/* Threshold for coping with large bursts of losses */ +#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) + enum dccp_ackvec_states { DCCPAV_RECEIVED = 0x00, DCCPAV_ECN_MARKED = 0x40, @@ -61,7 +64,6 @@ static inline u8 dccp_ackvec_state(const u8 *cell) * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) - * @av_veclen: length of the live portion of @av_buf */ struct dccp_ackvec { u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; @@ -72,7 +74,6 @@ struct dccp_ackvec { bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; u8 av_overflow:1; struct list_head av_records; - u16 av_vec_len; }; /** struct dccp_ackvec_record - Records information about sent Ack Vectors @@ -98,29 +99,38 @@ struct dccp_ackvec_record { u8 avr_ack_nonce:1; }; -struct sock; -struct sk_buff; - extern int dccp_ackvec_init(void); extern void dccp_ackvec_exit(void); extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); extern void dccp_ackvec_free(struct dccp_ackvec *av); -extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state); - -extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, - struct sock *sk, const u64 ackno); -extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, - u64 *ackno, const u8 opt, - const u8 *value, const u8 len); - +extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); +extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) { return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; } + +/** + * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb + * @vec: start of vector (offset into skb) + * @len: length of @vec + * @nonce: whether @vec had an ECN nonce of 0 or 1 + * @node: FIFO - arranged in descending order of ack_ackno + * This structure is used by CCIDs to access Ack Vectors in a received skb. + */ +struct dccp_ackvec_parsed { + u8 *vec, + len, + nonce:1; + struct list_head node; +}; + +extern int dccp_ackvec_parsed_add(struct list_head *head, + u8 *vec, u8 len, u8 nonce); +extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index cb1b4a0d1877..e96d5e810039 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -246,68 +246,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) #endif } -/* XXX Lame code duplication! - * returns -1 if none was found. - * else returns the next offset to use in the function call. - */ -static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, - unsigned char **vec, unsigned char *veclen) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - unsigned char opt, len; - unsigned char *value; - - BUG_ON(offset < 0); - options += offset; - opt_ptr = options; - if (opt_ptr >= opt_end) - return -1; - - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_invalid_option; - - len = *opt_ptr++; - if (len < 3) - goto out_invalid_option; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_invalid_option; - } - - switch (opt) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - *vec = value; - *veclen = len; - return offset + (opt_ptr - options); - } - } - - return -1; - -out_invalid_option: - DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); - return -1; -} - /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * This code is almost identical with TCP's tcp_rtt_estimator(), since @@ -432,16 +370,28 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) ccid2_change_l_ack_ratio(sk, hc->tx_cwnd); } +static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + + switch (option) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, + option - DCCPO_ACK_VECTOR_0); + } + return 0; +} + static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); + struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; - unsigned char *vector; - unsigned char veclen; - int offset = 0; int done = 0; unsigned int maxincr = 0; @@ -475,17 +425,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* check forward path congestion */ - /* still didn't send out new data packets */ - if (hc->tx_seqh == hc->tx_seqt) + if (dccp_packet_without_ack(skb)) return; - switch (DCCP_SKB_CB(skb)->dccpd_type) { - case DCCP_PKT_ACK: - case DCCP_PKT_DATAACK: - break; - default: - return; - } + /* still didn't send out new data packets */ + if (hc->tx_seqh == hc->tx_seqt) + goto done; ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; if (after48(ackno, hc->tx_high_ack)) @@ -509,15 +454,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - while ((offset = ccid2_ackvector(sk, skb, offset, - &vector, &veclen)) != -1) { + list_for_each_entry(avp, &hc->tx_av_chunks, node) { /* go through this ack vector */ - while (veclen--) { - u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector)); + for (; avp->len--; avp->vec++) { + u64 ackno_end_rl = SUB48(ackno, + dccp_ackvec_runlen(avp->vec)); - ccid2_pr_debug("ackvec start:%llu end:%llu\n", + ccid2_pr_debug("ackvec %llu |%u,%u|\n", (unsigned long long)ackno, - (unsigned long long)ackno_end_rl); + dccp_ackvec_state(avp->vec) >> 6, + dccp_ackvec_runlen(avp->vec)); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. @@ -536,7 +482,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(vector); + const u8 state = dccp_ackvec_state(avp->vec); /* new packet received or marked */ if (state != DCCPAV_NOT_RECEIVED && @@ -563,7 +509,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); - vector++; } if (done) break; @@ -631,10 +576,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) sk_stop_timer(sk, &hc->tx_rtotimer); else sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - +done: /* check if incoming Acks allow pending packets to be sent */ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -663,6 +609,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_last_cong = ccid2_time_stamp; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); + INIT_LIST_HEAD(&hc->tx_av_chunks); return 0; } @@ -696,16 +643,17 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) } struct ccid_operations ccid2_ops = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 25cb6b216eda..e9985dafc2c7 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -55,6 +55,7 @@ struct ccid2_seq { * @tx_rtt_seq: to decay RTTVAR at most once per flight * @tx_rpseq: last consecutive seqno * @tx_rpdupack: dupacks since rpseq + * @tx_av_chunks: list of Ack Vectors received on current skb */ struct ccid2_hc_tx_sock { u32 tx_cwnd; @@ -79,6 +80,7 @@ struct ccid2_hc_tx_sock { int tx_rpdupack; u32 tx_last_cong; u64 tx_high_ack; + struct list_head tx_av_chunks; }; static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 19fafd597465..45087052d894 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -93,9 +93,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) -/* Maximal interval between probes for local resources. */ -#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U)) - /* sysctl variables for DCCP */ extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; @@ -203,12 +200,7 @@ struct dccp_mib { DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); #define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) #define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) -#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field) #define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) -#define DCCP_ADD_STATS_BH(field, val) \ - SNMP_ADD_STATS_BH(dccp_statistics, field, val) -#define DCCP_ADD_STATS_USER(field, val) \ - SNMP_ADD_STATS_USER(dccp_statistics, field, val) /* * Checksumming routines @@ -243,6 +235,19 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); +/* + * TX Packet Dequeueing Interface + */ +extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); +extern bool dccp_qpolicy_full(struct sock *sk); +extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); +extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); +extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); +extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); + +/* + * TX Packet Output and TX Timers + */ extern void dccp_write_xmit(struct sock *sk); extern void dccp_write_space(struct sock *sk); extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); diff --git a/net/dccp/input.c b/net/dccp/input.c index c7aeeba859d4..15af247ea007 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -160,13 +160,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } -static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) +static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) { - struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - if (dp->dccps_hc_rx_ackvec != NULL) - dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_ack_seq); + if (av == NULL) + return; + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvec_input(av, skb); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) @@ -239,7 +241,8 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) dccp_update_gsr(sk, seqno); if (dh->dccph_type != DCCP_PKT_SYNC && - (ackno != DCCP_PKT_WITHOUT_ACK_SEQ)) + ackno != DCCP_PKT_WITHOUT_ACK_SEQ && + after48(ackno, dp->dccps_gar)) dp->dccps_gar = ackno; } else { unsigned long now = jiffies; @@ -365,21 +368,13 @@ discard: int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len) { - struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) - goto discard; + dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); @@ -631,14 +626,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dccp_parse_options(sk, NULL, skb)) return 1; - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) - goto discard; - + dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3f69ea114829..45a434f94169 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -462,15 +462,12 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, { struct rtable *rt; struct flowi fl = { .oif = skb_rtable(skb)->rt_iif, - .nl_u = { .ip4_u = - { .daddr = ip_hdr(skb)->saddr, - .saddr = ip_hdr(skb)->daddr, - .tos = RT_CONN_FLAGS(sk) } }, + .fl4_dst = ip_hdr(skb)->saddr, + .fl4_src = ip_hdr(skb)->daddr, + .fl4_tos = RT_CONN_FLAGS(sk), .proto = sk->sk_protocol, - .uli_u = { .ports = - { .sport = dccp_hdr(skb)->dccph_dport, - .dport = dccp_hdr(skb)->dccph_sport } - } + .fl_ip_sport = dccp_hdr(skb)->dccph_dport, + .fl_ip_dport = dccp_hdr(skb)->dccph_sport }; security_skb_classify_flow(skb, &fl); diff --git a/net/dccp/options.c b/net/dccp/options.c index 5adeeed5e0d2..f06ffcfc8d71 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + @@ -129,14 +128,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, if (rc) goto out_featneg_failed; break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) - goto out_invalid_option; - break; case DCCPO_TIMESTAMP: if (len != 4) goto out_invalid_option; @@ -226,6 +217,16 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, pkt_type, opt, value, len)) goto out_invalid_option; break; + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ + break; + /* + * Ack vectors are processed by the TX CCID if it is + * interested. The RX CCID need not parse Ack Vectors, + * since it is only interested in clearing old state. + * Fall through. + */ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) @@ -429,6 +430,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const u16 buflen = dccp_ackvec_buflen(av); /* Figure out how many options do we need to represent the ackvec */ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); @@ -437,10 +439,25 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) const unsigned char *tail, *from; unsigned char *to; - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, + dccp_packet_name(dcb->dccpd_type)); return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; + } + /* + * Since Ack Vectors are variable-length, we can not always predict + * their size. To catch exception cases where the space is running out + * on the skb, a separate Sync is scheduled to carry the Ack Vector. + */ + if (len > DCCPAV_MIN_OPTLEN && + len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { + DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " + "MPS=%u ==> reduce payload size?\n", len, skb->len, + dcb->dccpd_opt_len, dp->dccps_mss_cache); + dp->dccps_sync_scheduled = 1; + return 0; + } + dcb->dccpd_opt_len += len; to = skb_push(skb, len); len = buflen; @@ -481,7 +498,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) /* * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. */ - if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce)) + if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) return -ENOBUFS; return 0; } diff --git a/net/dccp/output.c b/net/dccp/output.c index 45b91853f5ae..784d30210543 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -242,7 +242,7 @@ static void dccp_xmit_packet(struct sock *sk) { int err, len; struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); + struct sk_buff *skb = dccp_qpolicy_pop(sk); if (unlikely(skb == NULL)) return; @@ -283,6 +283,15 @@ static void dccp_xmit_packet(struct sock *sk) * any local drop will eventually be reported via receiver feedback. */ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); + + /* + * If the CCID needs to transfer additional header options out-of-band + * (e.g. Ack Vectors or feature-negotiation options), it activates this + * flag to schedule a Sync. The Sync will automatically incorporate all + * currently pending header options, thus clearing the backlog. + */ + if (dp->dccps_sync_scheduled) + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } /** @@ -336,7 +345,7 @@ void dccp_write_xmit(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_write_queue))) { + while ((skb = dccp_qpolicy_top(sk))) { int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); switch (ccid_packet_dequeue_eval(rc)) { @@ -350,8 +359,7 @@ void dccp_write_xmit(struct sock *sk) dccp_xmit_packet(sk); break; case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); + dccp_qpolicy_drop(sk, skb); dccp_pr_debug("packet discarded due to err=%d\n", rc); } } @@ -636,6 +644,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; + /* + * Clear the flag in case the Sync was scheduled for out-of-band data, + * such as carrying a long Ack Vector. + */ + dccp_sk(sk)->dccps_sync_scheduled = 0; + dccp_transmit_skb(sk, skb); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ef343d53fcea..152975d942d9 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -185,6 +185,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; + dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); @@ -532,6 +533,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; + case DCCP_SOCKOPT_QPOLICY_ID: + if (sk->sk_state != DCCP_CLOSED) + err = -EISCONN; + else if (val < 0 || val >= DCCPQ_POLICY_MAX) + err = -EINVAL; + else + dp->dccps_qpolicy = val; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + if (val < 0) + err = -EINVAL; + else + dp->dccps_tx_qlen = val; + break; default: err = -ENOPROTOOPT; break; @@ -639,6 +654,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; + case DCCP_SOCKOPT_QPOLICY_ID: + val = dp->dccps_qpolicy; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + val = dp->dccps_tx_qlen; + break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -681,6 +702,47 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif +static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) +{ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + + /* + * Assign an (opaque) qpolicy priority value to skb->priority. + * + * We are overloading this skb field for use with the qpolicy subystem. + * The skb->priority is normally used for the SO_PRIORITY option, which + * is initialised from sk_priority. Since the assignment of sk_priority + * to skb->priority happens later (on layer 3), we overload this field + * for use with queueing priorities as long as the skb is on layer 4. + * The default priority value (if nothing is set) is 0. + */ + skb->priority = 0; + + for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_DCCP) + continue; + + if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && + !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) + return -EINVAL; + + switch (cmsg->cmsg_type) { + case DCCP_SCM_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) + return -EINVAL; + skb->priority = *(__u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -696,8 +758,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (sysctl_dccp_tx_qlen && - (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { + if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_release; } @@ -725,7 +786,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - skb_queue_tail(&sk->sk_write_queue, skb); + rc = dccp_msghdr_parse(msg, skb); + if (rc != 0) + goto out_discard; + + dccp_qpolicy_push(sk, skb); /* * The xmit_timer is set if the TX CCID is rate-based and will expire * when congestion control permits to release further packets into the diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c new file mode 100644 index 000000000000..63c30bfa4703 --- /dev/null +++ b/net/dccp/qpolicy.c @@ -0,0 +1,137 @@ +/* + * net/dccp/qpolicy.c + * + * Policy-based packet dequeueing interface for DCCP. + * + * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. + */ +#include "dccp.h" + +/* + * Simple Dequeueing Policy: + * If tx_qlen is different from 0, enqueue up to tx_qlen elements. + */ +static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) +{ + skb_queue_tail(&sk->sk_write_queue, skb); +} + +static bool qpolicy_simple_full(struct sock *sk) +{ + return dccp_sk(sk)->dccps_tx_qlen && + sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; +} + +static struct sk_buff *qpolicy_simple_top(struct sock *sk) +{ + return skb_peek(&sk->sk_write_queue); +} + +/* + * Priority-based Dequeueing Policy: + * If tx_qlen is different from 0 and the queue has reached its upper bound + * of tx_qlen elements, replace older packets lowest-priority-first. + */ +static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) +{ + struct sk_buff *skb, *best = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (best == NULL || skb->priority > best->priority) + best = skb; + return best; +} + +static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) +{ + struct sk_buff *skb, *worst = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (worst == NULL || skb->priority < worst->priority) + worst = skb; + return worst; +} + +static bool qpolicy_prio_full(struct sock *sk) +{ + if (qpolicy_simple_full(sk)) + dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); + return false; +} + +/** + * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface + * @push: add a new @skb to the write queue + * @full: indicates that no more packets will be admitted + * @top: peeks at whatever the queueing policy defines as its `top' + */ +static struct dccp_qpolicy_operations { + void (*push) (struct sock *sk, struct sk_buff *skb); + bool (*full) (struct sock *sk); + struct sk_buff* (*top) (struct sock *sk); + __be32 params; + +} qpol_table[DCCPQ_POLICY_MAX] = { + [DCCPQ_POLICY_SIMPLE] = { + .push = qpolicy_simple_push, + .full = qpolicy_simple_full, + .top = qpolicy_simple_top, + .params = 0, + }, + [DCCPQ_POLICY_PRIO] = { + .push = qpolicy_simple_push, + .full = qpolicy_prio_full, + .top = qpolicy_prio_best_skb, + .params = DCCP_SCM_PRIORITY, + }, +}; + +/* + * Externally visible interface + */ +void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) +{ + qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); +} + +bool dccp_qpolicy_full(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); +} + +void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + skb_unlink(skb, &sk->sk_write_queue); + kfree_skb(skb); + } +} + +struct sk_buff *dccp_qpolicy_top(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); +} + +struct sk_buff *dccp_qpolicy_pop(struct sock *sk) +{ + struct sk_buff *skb = dccp_qpolicy_top(sk); + + if (skb != NULL) { + /* Clear any skb fields that we used internally */ + skb->priority = 0; + skb_unlink(skb, &sk->sk_write_queue); + } + return skb; +} + +bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) +{ + /* check if exactly one bit is set */ + if (!param || (param & (param - 1))) + return false; + return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; +} |