diff options
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r-- | net/ipv4/udp.c | 245 |
1 files changed, 203 insertions, 42 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1d6219bf2d6b..25294d43e147 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -577,7 +577,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL); - if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } @@ -1163,23 +1163,62 @@ out: return ret; } +#if BITS_PER_LONG == 64 +static void udp_set_dev_scratch(struct sk_buff *skb) +{ + struct udp_dev_scratch *scratch; + + BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); + scratch = (struct udp_dev_scratch *)&skb->dev_scratch; + scratch->truesize = skb->truesize; + scratch->len = skb->len; + scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); + scratch->is_linear = !skb_is_nonlinear(skb); +} + +static int udp_skb_truesize(struct sk_buff *skb) +{ + return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; +} +#else +static void udp_set_dev_scratch(struct sk_buff *skb) +{ + skb->dev_scratch = skb->truesize; +} + +static int udp_skb_truesize(struct sk_buff *skb) +{ + return skb->dev_scratch; +} +#endif + /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial) +static void udp_rmem_release(struct sock *sk, int size, int partial, + bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); + struct sk_buff_head *sk_queue; int amt; if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; if (size < (sk->sk_rcvbuf >> 2) && - !skb_queue_empty(&sk->sk_receive_queue)) + !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; } up->forward_deficit = 0; + /* acquire the sk_receive_queue for fwd allocated memory scheduling, + * if the called don't held it already + */ + sk_queue = &sk->sk_receive_queue; + if (!rx_queue_lock_held) + spin_lock(&sk_queue->lock); + + sk->sk_forward_alloc += size; amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1); sk->sk_forward_alloc -= amt; @@ -1188,19 +1227,33 @@ static void udp_rmem_release(struct sock *sk, int size, int partial) __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); + + /* this can save us from acquiring the rx queue lock on next receive */ + skb_queue_splice_tail_init(sk_queue, &up->reader_queue); + + if (!rx_queue_lock_held) + spin_unlock(&sk_queue->lock); } -/* Note: called with sk_receive_queue.lock held. +/* Note: called with reader_queue.lock held. * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch * This avoids a cache line miss while receive_queue lock is held. * Look at __udp_enqueue_schedule_skb() to find where this copy is done. */ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { - udp_rmem_release(sk, skb->dev_scratch, 1); + prefetch(&skb->data); + udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } EXPORT_SYMBOL(udp_skb_destructor); +/* as above, but the caller held the rx queue lock, too */ +static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) +{ + prefetch(&skb->data); + udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); +} + /* Idea of busylocks is to let producers grab an extra spinlock * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line @@ -1252,10 +1305,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) busy = busylock_acquire(sk); } size = skb->truesize; - /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss - * in udp_skb_destructor() - */ - skb->dev_scratch = size; + udp_set_dev_scratch(skb); /* we drop only if the receive buf is full and the receive * queue contains some other skb @@ -1306,14 +1356,16 @@ EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); void udp_destruct_sock(struct sock *sk) { /* reclaim completely the forward allocated memory */ + struct udp_sock *up = udp_sk(sk); unsigned int total = 0; struct sk_buff *skb; - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue); + while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) { total += skb->truesize; kfree_skb(skb); } - udp_rmem_release(sk, total, 0); + udp_rmem_release(sk, total, 0, true); inet_sock_destruct(sk); } @@ -1321,6 +1373,7 @@ EXPORT_SYMBOL_GPL(udp_destruct_sock); int udp_init_sock(struct sock *sk) { + skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udp_destruct_sock; return 0; } @@ -1334,10 +1387,38 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) sk_peek_offset_bwd(sk, len); unlock_sock_fast(sk, slow); } - consume_skb(skb); + + consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); +static struct sk_buff *__first_packet_length(struct sock *sk, + struct sk_buff_head *rcvq, + int *total) +{ + struct sk_buff *skb; + + while ((skb = skb_peek(rcvq)) != NULL) { + if (udp_lib_checksum_complete(skb)) { + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); + __skb_unlink(skb, rcvq); + *total += skb->truesize; + kfree_skb(skb); + } else { + /* the csum related bits could be changed, refresh + * the scratch area + */ + udp_set_dev_scratch(skb); + break; + } + } + return skb; +} + /** * first_packet_length - return length of first packet in receive queue * @sk: socket @@ -1347,26 +1428,24 @@ EXPORT_SYMBOL_GPL(skb_consume_udp); */ static int first_packet_length(struct sock *sk) { - struct sk_buff_head *rcvq = &sk->sk_receive_queue; + struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; + struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff *skb; int total = 0; int res; spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL && - udp_lib_checksum_complete(skb)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); - __skb_unlink(skb, rcvq); - total += skb->truesize; - kfree_skb(skb); + skb = __first_packet_length(sk, rcvq, &total); + if (!skb && !skb_queue_empty(sk_queue)) { + spin_lock(&sk_queue->lock); + skb_queue_splice_tail_init(sk_queue, rcvq); + spin_unlock(&sk_queue->lock); + + skb = __first_packet_length(sk, rcvq, &total); } res = skb ? skb->len : -1; if (total) - udp_rmem_release(sk, total, 1); + udp_rmem_release(sk, total, 1, false); spin_unlock_bh(&rcvq->lock); return res; } @@ -1400,6 +1479,77 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) } EXPORT_SYMBOL(udp_ioctl); +struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, + int noblock, int *peeked, int *off, int *err) +{ + struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + struct sk_buff_head *queue; + struct sk_buff *last; + long timeo; + int error; + + queue = &udp_sk(sk)->reader_queue; + flags |= noblock ? MSG_DONTWAIT : 0; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + do { + struct sk_buff *skb; + + error = sock_error(sk); + if (error) + break; + + error = -EAGAIN; + *peeked = 0; + do { + spin_lock_bh(&queue->lock); + skb = __skb_try_recv_from_queue(sk, queue, flags, + udp_skb_destructor, + peeked, off, err, + &last); + if (skb) { + spin_unlock_bh(&queue->lock); + return skb; + } + + if (skb_queue_empty(sk_queue)) { + spin_unlock_bh(&queue->lock); + goto busy_check; + } + + /* refill the reader queue and walk it again + * keep both queues locked to avoid re-acquiring + * the sk_receive_queue lock if fwd memory scheduling + * is needed. + */ + spin_lock(&sk_queue->lock); + skb_queue_splice_tail_init(sk_queue, queue); + + skb = __skb_try_recv_from_queue(sk, queue, flags, + udp_skb_dtor_locked, + peeked, off, err, + &last); + spin_unlock(&sk_queue->lock); + spin_unlock_bh(&queue->lock); + if (skb) + return skb; + +busy_check: + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (!skb_queue_empty(sk_queue)); + + /* sk_queue is empty, reader_queue may contain peeked packets */ + } while (timeo && + !__skb_wait_for_more_packets(sk, &error, &timeo, + (struct sk_buff *)sk_queue)); + + *err = error; + return NULL; +} +EXPORT_SYMBOL_GPL(__skb_recv_udp); + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -1426,7 +1576,7 @@ try_again: if (!skb) return err; - ulen = skb->len; + ulen = udp_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; @@ -1441,14 +1591,18 @@ try_again: if (copied < ulen || peeking || (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { - checksum_valid = !udp_lib_checksum_complete(skb); + checksum_valid = udp_skb_csum_unnecessary(skb) || + !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } - if (checksum_valid || skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, off, msg, copied); - else { + if (checksum_valid || udp_skb_csum_unnecessary(skb)) { + if (udp_skb_is_linear(skb)) + err = copy_linear_skb(skb, copied, off, &msg->msg_iter); + else + err = skb_copy_datagram_msg(skb, off, msg, copied); + } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) @@ -1490,7 +1644,8 @@ try_again: return err; csum_copy_err: - if (!__sk_queue_drop_skb(sk, skb, flags, udp_skb_destructor)) { + if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, + udp_skb_destructor)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -1624,6 +1779,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } + /* clear all pending head states while they are hot in the cache */ + skb_release_head_state(skb); + rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1738,6 +1896,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } + prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; @@ -1766,9 +1925,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; - dst_hold(dst); - old = xchg(&sk->sk_rx_dst, dst); - dst_release(old); + if (dst_hold_safe(dst)) { + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); + } } /* @@ -2082,7 +2242,7 @@ void udp_v4_early_demux(struct sk_buff *skb) uh->source, iph->saddr, dif); } - if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) + if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) return; skb->sk = sk; @@ -2092,13 +2252,11 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { - /* DST_NOCACHE can not be used without taking a reference */ - if (dst->flags & DST_NOCACHE) { - if (likely(atomic_inc_not_zero(&dst->__refcnt))) - skb_dst_set(skb, dst); - } else { - skb_dst_set_noref(skb, dst); - } + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); } } @@ -2325,6 +2483,9 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) unsigned int mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; + if (!skb_queue_empty(&udp_sk(sk)->reader_queue)) + mask |= POLLIN | POLLRDNORM; + sock_rps_record_flow(sk); /* Check for false positives due to checksum errors */ @@ -2530,7 +2691,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, + refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } |