From 0bfe649fbb1337400065fa47679b381b2ac845f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Oct 2017 17:05:57 +0200 Subject: fq_impl: Properly enforce memory limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fq structure would fail to properly enforce the memory limit in the case where the packet being enqueued was bigger than the packet being removed to bring the memory usage down. So keep dropping packets until the memory usage is back below the limit. Also, fix the statistics for memory limit violations. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 4e6131cd3f43..ac1a2317941e 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -146,6 +146,7 @@ static void fq_tin_enqueue(struct fq *fq, fq_flow_get_default_t get_default_func) { struct fq_flow *flow; + bool oom; lockdep_assert_held(&fq->lock); @@ -167,8 +168,8 @@ static void fq_tin_enqueue(struct fq *fq, } __skb_queue_tail(&flow->queue, skb); - - if (fq->backlog > fq->limit || fq->memory_usage > fq->memory_limit) { + oom = (fq->memory_usage > fq->memory_limit); + while (fq->backlog > fq->limit || oom) { flow = list_first_entry_or_null(&fq->backlogs, struct fq_flow, backlogchain); @@ -183,8 +184,10 @@ static void fq_tin_enqueue(struct fq *fq, flow->tin->overlimit++; fq->overlimit++; - if (fq->memory_usage > fq->memory_limit) + if (oom) { fq->overmemory++; + oom = (fq->memory_usage > fq->memory_limit); + } } } -- cgit v1.2.3 From 829385f08ae99740276cbd46c9db29764c519211 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 20 Oct 2017 16:40:43 -0700 Subject: strparser: Use delayed work instead of timer for msg timeout Sock lock may be taken in the message timer function which is a problem since timers run in BH. Instead of timers use delayed_work. Reported-by: Eric Dumazet Fixes: bbb03029a899 ("strparser: Generalize strparser") Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/strparser.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/strparser.h b/include/net/strparser.h index 7dc131d62ad5..d96b59f45eba 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -74,10 +74,9 @@ struct strparser { u32 unrecov_intr : 1; struct sk_buff **skb_nextp; - struct timer_list msg_timer; struct sk_buff *skb_head; unsigned int need_bytes; - struct delayed_work delayed_work; + struct delayed_work msg_timer_work; struct work_struct work; struct strp_stats stats; struct strp_callbacks cb; -- cgit v1.2.3 From be0f161ef141e4df368aa3f417a1c2ab9c362e75 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 28 Sep 2017 15:33:50 -0500 Subject: net/mlx5e: DCBNL, Implement tc with ets type and zero bandwidth Previously, tc with ets type and zero bandwidth is not accepted by driver. This behavior does not follow the IEEE802.1qaz spec. If there are tcs with ets type and zero bandwidth, these tcs are assigned to the lowest priority tc_group #0. We equally distribute 100% bw of the tc_group #0 to these zero bandwidth ets tcs. Also, the non zero bandwidth ets tcs are assigned to tc_group #1. If there is no zero bandwidth ets tc, the non zero bandwidth ets tcs are assigned to tc_group #0. Fixes: cdcf11212b22 ("net/mlx5e: Validate BW weight values of ETS") Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/port.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index c57d4b7de3a8..c59af8ab753a 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -157,6 +157,8 @@ int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev, u8 prio, u8 *tc); int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group); +int mlx5_query_port_tc_group(struct mlx5_core_dev *mdev, + u8 tc, u8 *tc_group); int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw); int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 tc, u8 *bw_pct); -- cgit v1.2.3 From 06f877d613be3621604c2520ec0351d9fbdca15f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Oct 2017 08:20:31 -0700 Subject: tcp/dccp: fix other lockdep splats accessing ireq_opt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In my first attempt to fix the lockdep splat, I forgot we could enter inet_csk_route_req() with a freshly allocated request socket, for which refcount has not yet been elevated, due to complex SLAB_TYPESAFE_BY_RCU rules. We either are in rcu_read_lock() section _or_ we own a refcount on the request. Correct RCU verb to use here is rcu_dereference_check(), although it is not possible to prove we actually own a reference on a shared refcount :/ In v2, I added ireq_opt_deref() helper and use in three places, to fix other possible splats. [ 49.844590] lockdep_rcu_suspicious+0xea/0xf3 [ 49.846487] inet_csk_route_req+0x53/0x14d [ 49.848334] tcp_v4_route_req+0xe/0x10 [ 49.850174] tcp_conn_request+0x31c/0x6a0 [ 49.851992] ? __lock_acquire+0x614/0x822 [ 49.854015] tcp_v4_conn_request+0x5a/0x79 [ 49.855957] ? tcp_v4_conn_request+0x5a/0x79 [ 49.858052] tcp_rcv_state_process+0x98/0xdcc [ 49.859990] ? sk_filter_trim_cap+0x2f6/0x307 [ 49.862085] tcp_v4_do_rcv+0xfc/0x145 [ 49.864055] ? tcp_v4_do_rcv+0xfc/0x145 [ 49.866173] tcp_v4_rcv+0x5ab/0xaf9 [ 49.868029] ip_local_deliver_finish+0x1af/0x2e7 [ 49.870064] ip_local_deliver+0x1b2/0x1c5 [ 49.871775] ? inet_del_offload+0x45/0x45 [ 49.873916] ip_rcv_finish+0x3f7/0x471 [ 49.875476] ip_rcv+0x3f1/0x42f [ 49.876991] ? ip_local_deliver_finish+0x2e7/0x2e7 [ 49.878791] __netif_receive_skb_core+0x6d3/0x950 [ 49.880701] ? process_backlog+0x7e/0x216 [ 49.882589] __netif_receive_skb+0x1d/0x5e [ 49.884122] process_backlog+0x10c/0x216 [ 49.885812] net_rx_action+0x147/0x3df Fixes: a6ca7abe53633 ("tcp/dccp: fix lockdep splat in inet_csk_route_req()") Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races") Signed-off-by: Eric Dumazet Reported-by: kernel test robot Reported-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/net/inet_sock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f768d2..db8162dd8c0b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -132,6 +132,12 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, return sk->sk_bound_dev_if; } +static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) +{ + return rcu_dereference_check(ireq->ireq_opt, + refcount_read(&ireq->req.rsk_refcnt) > 0); +} + struct inet_cork { unsigned int flags; __be32 addr; -- cgit v1.2.3 From dea6e19f4ef746aa18b4c33d1a7fed54356796ed Mon Sep 17 00:00:00 2001 From: Girish Moodalbail Date: Fri, 27 Oct 2017 00:00:16 -0700 Subject: tap: reference to KVA of an unloaded module causes kernel panic The commit 9a393b5d5988 ("tap: tap as an independent module") created a separate tap module that implements tap functionality and exports interfaces that will be used by macvtap and ipvtap modules to create create respective tap devices. However, that patch introduced a regression wherein the modules macvtap and ipvtap can be removed (through modprobe -r) while there are applications using the respective /dev/tapX devices. These applications cause kernel to hold reference to /dev/tapX through 'struct cdev macvtap_cdev' and 'struct cdev ipvtap_dev' defined in macvtap and ipvtap modules respectively. So, when the application is later closed the kernel panics because we are referencing KVA that is present in the unloaded modules. ----------8<------- Example ----------8<---------- $ sudo ip li add name mv0 link enp7s0 type macvtap $ sudo ip li show mv0 |grep mv0| awk -e '{print $1 $2}' 14:mv0@enp7s0: $ cat /dev/tap14 & $ lsmod |egrep -i 'tap|vlan' macvtap 16384 0 macvlan 24576 1 macvtap tap 24576 3 macvtap $ sudo modprobe -r macvtap $ fg cat /dev/tap14 ^C <...system panics...> BUG: unable to handle kernel paging request at ffffffffa038c500 IP: cdev_put+0xf/0x30 ----------8<-----------------8<---------- The fix is to set cdev.owner to the module that creates the tap device (either macvtap or ipvtap). With this set, the operations (in fs/char_dev.c) on char device holds and releases the module through cdev_get() and cdev_put() and will not allow the module to unload prematurely. Fixes: 9a393b5d5988ea4e (tap: tap as an independent module) Signed-off-by: Girish Moodalbail Signed-off-by: David S. Miller --- include/linux/if_tap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h index 4837157da0dc..9ae41cdd0d4c 100644 --- a/include/linux/if_tap.h +++ b/include/linux/if_tap.h @@ -73,8 +73,8 @@ void tap_del_queues(struct tap_dev *tap); int tap_get_minor(dev_t major, struct tap_dev *tap); void tap_free_minor(dev_t major, struct tap_dev *tap); int tap_queue_resize(struct tap_dev *tap); -int tap_create_cdev(struct cdev *tap_cdev, - dev_t *tap_major, const char *device_name); +int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, + const char *device_name, struct module *module); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev); #endif /*_LINUX_IF_TAP_H_*/ -- cgit v1.2.3 From 8108a77515126f6db4374e8593956e20430307c0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:34 -0700 Subject: bpf: bpf_compute_data uses incorrect cb structure SK_SKB program types use bpf_compute_data to store the end of the packet data. However, bpf_compute_data assumes the cb is stored in the qdisc layer format. But, for SK_SKB this is the wrong layer of the stack for this type. It happens to work (sort of!) because in most cases nothing happens to be overwritten today. This is very fragile and error prone. Fortunately, we have another hole in tcp_skb_cb we can use so lets put the data_end value there. Note, SK_SKB program types do not use data_meta, they are failed by sk_skb_is_valid_access(). Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index b1ef98ebce53..33599d17522d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -844,6 +844,7 @@ struct tcp_skb_cb { __u32 key; __u32 flags; struct bpf_map *map; + void *data_end; } bpf; }; }; -- cgit v1.2.3 From bfa640757e9378c2f26867e723f1287e94f5a7ad Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 27 Oct 2017 09:45:53 -0700 Subject: bpf: rename sk_actions to align with bpf infrastructure Recent additions to support multiple programs in cgroups impose a strict requirement, "all yes is yes, any no is no". To enforce this the infrastructure requires the 'no' return code, SK_DROP in this case, to be 0. To apply these rules to SK_SKB program types the sk_actions return codes need to be adjusted. This fix adds SK_PASS and makes 'SK_DROP = 0'. Finally, remove SK_ABORTED to remove any chance that the API may allow aborted program flows to be passed up the stack. This would be incorrect behavior and allow programs to break existing policies. Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f90860d1f897..0d7948ce2128 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -575,7 +575,7 @@ union bpf_attr { * @map: pointer to sockmap * @key: key to lookup sock in map * @flags: reserved for future use - * Return: SK_REDIRECT + * Return: SK_PASS * * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops @@ -786,8 +786,8 @@ struct xdp_md { }; enum sk_action { - SK_ABORTED = 0, - SK_DROP, + SK_DROP = 0, + SK_PASS, SK_REDIRECT, }; -- cgit v1.2.3 From 1da4fc97cbf89514e417a3df46eaec864a9b8a48 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:54 +0800 Subject: sctp: fix some type cast warnings introduced by stream reconf These warnings were found by running 'make C=2 M=net/sctp/'. They are introduced by not aware of Endian when coding stream reconf patches. Since commit c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable") enabled stream reconf feature for users, the Fixes tag below would use it. Fixes: c0d8bab6ae51 ("sctp: add get and set sockopt for reconf_enable") Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 32 ++++++++++++++++---------------- include/net/sctp/sm.h | 2 +- include/net/sctp/ulpevent.h | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 82b171e1aa0b..09d7412e9cb0 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -716,28 +716,28 @@ struct sctp_reconf_chunk { struct sctp_strreset_outreq { struct sctp_paramhdr param_hdr; - __u32 request_seq; - __u32 response_seq; - __u32 send_reset_at_tsn; - __u16 list_of_streams[0]; + __be32 request_seq; + __be32 response_seq; + __be32 send_reset_at_tsn; + __be16 list_of_streams[0]; }; struct sctp_strreset_inreq { struct sctp_paramhdr param_hdr; - __u32 request_seq; - __u16 list_of_streams[0]; + __be32 request_seq; + __be16 list_of_streams[0]; }; struct sctp_strreset_tsnreq { struct sctp_paramhdr param_hdr; - __u32 request_seq; + __be32 request_seq; }; struct sctp_strreset_addstrm { struct sctp_paramhdr param_hdr; - __u32 request_seq; - __u16 number_of_streams; - __u16 reserved; + __be32 request_seq; + __be16 number_of_streams; + __be16 reserved; }; enum { @@ -752,16 +752,16 @@ enum { struct sctp_strreset_resp { struct sctp_paramhdr param_hdr; - __u32 response_seq; - __u32 result; + __be32 response_seq; + __be32 result; }; struct sctp_strreset_resptsn { struct sctp_paramhdr param_hdr; - __u32 response_seq; - __u32 result; - __u32 senders_next_tsn; - __u32 receivers_next_tsn; + __be32 response_seq; + __be32 result; + __be32 senders_next_tsn; + __be32 receivers_next_tsn; }; #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2db3d3a9ce1d..88233cf8b8d4 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -261,7 +261,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, struct sctp_fwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc); struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc, - __u16 stream_num, __u16 *stream_list, + __u16 stream_num, __be16 *stream_list, bool out, bool in); struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc); diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index b8c86ec1a8f5..231dc42f1da6 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -130,7 +130,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_sender_dry_event( struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, - __u16 stream_num, __u16 *stream_list, gfp_t gfp); + __u16 stream_num, __be16 *stream_list, gfp_t gfp); struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( const struct sctp_association *asoc, __u16 flags, -- cgit v1.2.3 From 978aa0474115f3f5848949f2efce4def0766a5cb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 28 Oct 2017 19:43:57 +0800 Subject: sctp: fix some type cast warnings introduced since very beginning These warnings were found by running 'make C=2 M=net/sctp/'. They are there since very beginning. Note after this patch, there still one warning left in sctp_outq_flush(): sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM) Since it has been moved to sctp_stream_outq_migrate on net-next, to avoid the extra job when merging net-next to net, I will post the fix for it after the merging is done. Reported-by: Eric Dumazet Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 2 +- include/uapi/linux/sctp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 09d7412e9cb0..da803dfc7a39 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -231,7 +231,7 @@ struct sctp_datahdr { __be32 tsn; __be16 stream; __be16 ssn; - __be32 ppid; + __u32 ppid; __u8 payload[0]; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6217ff8500a1..84fc2914b7fb 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -376,7 +376,7 @@ struct sctp_remote_error { __u16 sre_type; __u16 sre_flags; __u32 sre_length; - __u16 sre_error; + __be16 sre_error; sctp_assoc_t sre_assoc_id; __u8 sre_data[0]; }; -- cgit v1.2.3 From 7aa0045dadb6ef37485ea9f2a7d28278ca588b51 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 26 Oct 2017 18:24:28 -0700 Subject: net_sched: introduce a workqueue for RCU callbacks of tc filter This patch introduces a dedicated workqueue for tc filters so that each tc filter's RCU callback could defer their action destroy work to this workqueue. The helper tcf_queue_work() is introduced for them to use. Because we hold RTNL lock when calling tcf_block_put(), we can not simply flush works inside it, therefore we have to defer it again to this workqueue and make sure all flying RCU callbacks have already queued their work before this one, in other words, to ensure this is the last one to execute to prevent any use-after-free. On the other hand, this makes tcf_block_put() ugly and harder to understand. Since David and Eric strongly dislike adding synchronize_rcu(), this is probably the only solution that could make everyone happy. Please also see the code comments below. Reported-by: Chris Mi Cc: Daniel Borkmann Cc: Jiri Pirko Cc: John Fastabend Cc: Jamal Hadi Salim Cc: "Paul E. McKenney" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ include/net/sch_generic.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e80edd8879ef..3009547f3c66 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -2,6 +2,7 @@ #define __NET_PKT_CLS_H #include +#include #include #include @@ -17,6 +18,8 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +bool tcf_queue_work(struct work_struct *work); + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 135f5a2dd931..0dec8a23be57 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -271,6 +272,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct work_struct work; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.2.3