From 23a1f8d44c0bca48f04fc2a2f1edafd826ce6133 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 16:04:31 +0200 Subject: mac80211: process and save VHT MU-MIMO group frame The Group ID Management frame is an Action frame of category VHT. It is transmitted by the AP to assign or change the user position of a STA for one or more group IDs. Process and save the group membership data. Notify underlying driver of changes. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++++ include/net/mac80211.h | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 452c0b0d2f32..d9ddb89533a7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -843,6 +843,8 @@ enum ieee80211_vht_opmode_bits { }; #define WLAN_SA_QUERY_TR_ID_LEN 2 +#define WLAN_MEMBERSHIP_LEN 8 +#define WLAN_USER_POSITION_LEN 16 /** * struct ieee80211_tpc_report_ie @@ -989,6 +991,11 @@ struct ieee80211_mgmt { u8 action_code; u8 operating_mode; } __packed vht_opmode_notif; + struct { + u8 action_code; + u8 membership[WLAN_MEMBERSHIP_LEN]; + u8 position[WLAN_USER_POSITION_LEN]; + } __packed vht_group_notif; struct { u8 action_code; u8 dialog_token; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7c30faff245f..8da483b2c067 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -298,6 +298,7 @@ struct ieee80211_vif_chanctx_switch { * note that this is only called when it changes after the channel * context had been assigned. * @BSS_CHANGED_OCB: OCB join status changed + * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -323,6 +324,7 @@ enum ieee80211_bss_change { BSS_CHANGED_BEACON_INFO = 1<<20, BSS_CHANGED_BANDWIDTH = 1<<21, BSS_CHANGED_OCB = 1<<22, + BSS_CHANGED_MU_GROUPS = 1<<23, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -435,6 +437,19 @@ struct ieee80211_event { } u; }; +/** + * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data + * + * This structure describes the group id data of VHT MU-MIMO + * + * @membership: 64 bits array - a bit is set if station is member of the group + * @position: 2 bits per group id indicating the position in the group + */ +struct ieee80211_mu_group_data { + u8 membership[WLAN_MEMBERSHIP_LEN]; + u8 position[WLAN_USER_POSITION_LEN]; +}; + /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * @@ -477,6 +492,7 @@ struct ieee80211_event { * @enable_beacon: whether beaconing should be enabled or not * @chandef: Channel definition for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. + * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. * This field is only valid when the channel is a wide HT/VHT channel. * Note that with TDLS this can be the case (channel is HT, protection must @@ -535,6 +551,7 @@ struct ieee80211_bss_conf { s32 cqm_rssi_thold; u32 cqm_rssi_hyst; struct cfg80211_chan_def chandef; + struct ieee80211_mu_group_data mu_group; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; int arp_addr_cnt; bool qos; -- cgit v1.2.3 From f9cfa5f354b11e56cd8f019c12e14a42585586cd Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 16:04:33 +0200 Subject: mac80211: add flag for duplication check Add an option for driver to check for packet duplication by itself. This is needed for example by the iwlwifi driver which parallelizes the RX path and does the duplication check per queue. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8da483b2c067..ecab934dc8d9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1063,7 +1063,7 @@ enum mac80211_rx_flags { RX_FLAG_HT_GF = BIT(13), RX_FLAG_AMPDU_DETAILS = BIT(14), RX_FLAG_PN_VALIDATED = BIT(15), - /* bit 16 free */ + RX_FLAG_DUP_VALIDATED = BIT(16), RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), RX_FLAG_AMPDU_IS_LAST = BIT(18), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), -- cgit v1.2.3 From fad471860c097844432c7cf5d3ae6a0a059c2bdc Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 16:04:34 +0200 Subject: mac80211: pass RX aggregation window size to driver Currently mac80211 does not inform the driver of the window size when starting an RX aggregation session. To enable managing the reorder buffer in the driver or hardware the window size is needed. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ecab934dc8d9..a990338a766e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3047,9 +3047,11 @@ enum ieee80211_reconfig_type { * ieee80211_ampdu_mlme_action. Starting sequence number (@ssn) * is the first frame we expect to perform the action on. Notice * that TX/RX_STOP can pass NULL for this parameter. - * The @buf_size parameter is only valid when the action is set to - * %IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's reorder - * buffer size (number of subframes) for this session -- the driver + * The @buf_size parameter is valid only when the action is set to + * %IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and + * indicates the reorder buffer size (number of subframes) for this + * session. + * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver * may neither send aggregates containing more subframes than this * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be -- cgit v1.2.3 From 4352a4d7f6bfd0aed0276a13fa4993db35714db4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Dec 2015 16:04:35 +0200 Subject: mac80211: document status.freq restrictions It's not always necessary to set the status.freq field, for example when this would be an expensive calculation. It must be set for all management frames (as they might be reported to userspace), but for data frames it's not really required. Document this. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a990338a766e..bdee1cc19c7e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1108,6 +1108,8 @@ enum mac80211_rx_vht_flags { * it but can store it and pass it back to the driver for synchronisation * @band: the active band when this frame was received * @freq: frequency the radio was tuned to when receiving this frame, in MHz + * This field must be set for management frames, but isn't strictly needed + * for data (other) frames - for those it only affects radiotap reporting. * @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* -- cgit v1.2.3 From 50ea05efaf3bed7dd34bcc2635a8b3f53bd0ccc1 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Wed, 30 Dec 2015 16:06:04 +0200 Subject: mac80211: pass block ack session timeout to to driver Currently mac80211 does not inform the driver of the session block ack timeout when starting a rx aggregation session. Drivers that manage the reorder buffer need to know this parameter. Seeing that there are now too many arguments for the drv_ampdu_action() function, wrap them inside a structure. Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- include/net/mac80211.h | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bdee1cc19c7e..6c9c559394b0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2702,6 +2702,33 @@ enum ieee80211_ampdu_mlme_action { IEEE80211_AMPDU_TX_OPERATIONAL, }; +/** + * struct ieee80211_ampdu_params - AMPDU action parameters + * + * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action. + * @sta: peer of this AMPDU session + * @tid: tid of the BA session + * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When + * action is set to %IEEE80211_AMPDU_RX_START the driver passes back the + * actual ssn value used to start the session and writes the value here. + * @buf_size: reorder buffer size (number of subframes). Valid only when the + * action is set to %IEEE80211_AMPDU_RX_START or + * %IEEE80211_AMPDU_TX_OPERATIONAL + * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU. + * valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL + * @timeout: BA session timeout. Valid only when the action is set to + * %IEEE80211_AMPDU_RX_START + */ +struct ieee80211_ampdu_params { + enum ieee80211_ampdu_mlme_action action; + struct ieee80211_sta *sta; + u16 tid; + u16 ssn; + u8 buf_size; + bool amsdu; + u16 timeout; +}; + /** * enum ieee80211_frame_release_type - frame release reason * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll @@ -3046,15 +3073,9 @@ enum ieee80211_reconfig_type { * @ampdu_action: Perform a certain A-MPDU action * The RA/TID combination determines the destination and TID we want * the ampdu action to be performed for. The action is defined through - * ieee80211_ampdu_mlme_action. Starting sequence number (@ssn) - * is the first frame we expect to perform the action on. Notice - * that TX/RX_STOP can pass NULL for this parameter. - * The @buf_size parameter is valid only when the action is set to - * %IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and - * indicates the reorder buffer size (number of subframes) for this - * session. + * ieee80211_ampdu_mlme_action. * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver - * may neither send aggregates containing more subframes than this + * may neither send aggregates containing more subframes than @buf_size * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be * possible with a buf_size of 8: @@ -3065,9 +3086,6 @@ enum ieee80211_reconfig_type { * buffer size of 8. Correct ways to retransmit #1 would be: * - TX: 1 or 18 or 81 * Even "189" would be wrong since 1 could be lost again. - * The @amsdu parameter is valid when the action is set to - * %IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's ability - * to receive A-MSDU within A-MPDU. * * Returns a negative error code on failure. * The callback can sleep. @@ -3409,9 +3427,7 @@ struct ieee80211_ops { int (*tx_last_beacon)(struct ieee80211_hw *hw); int (*ampdu_action)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, u16 *ssn, - u8 buf_size, bool amsdu); + struct ieee80211_ampdu_params *params); int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); -- cgit v1.2.3 From 6e7333d315a768170a59ac771297ee0551bdddbf Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 1 Feb 2016 18:51:05 -0500 Subject: net: add rx_nohandler stat counter This adds an rx_nohandler stat counter, along with a sysfs statistics node, and copies the counter out via netlink as well. CC: "David S. Miller" CC: Eric Dumazet CC: Jiri Pirko CC: Daniel Borkmann CC: Tom Herbert CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/uapi/linux/if_link.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 289c2314d766..78a20cec2a0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1397,6 +1397,8 @@ enum netdev_priv_flags { * do not use this in drivers * @tx_dropped: Dropped packets by core network, * do not use this in drivers + * @rx_nohandler: nohandler dropped packets by core network on + * inactive devices, do not use this in drivers * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, @@ -1611,6 +1613,7 @@ struct net_device { atomic_long_t rx_dropped; atomic_long_t tx_dropped; + atomic_long_t rx_nohandler; #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def * wireless_handlers; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a30b78090594..d3e90b91e07e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -35,6 +35,8 @@ struct rtnl_link_stats { /* for cslip etc */ __u32 rx_compressed; __u32 tx_compressed; + + __u32 rx_nohandler; /* dropped, no handler found */ }; /* The main device statistics structure */ @@ -68,6 +70,8 @@ struct rtnl_link_stats64 { /* for cslip etc */ __u64 rx_compressed; __u64 tx_compressed; + + __u64 rx_nohandler; /* dropped, no handler found */ }; /* The struct should be in sync with struct ifmap */ -- cgit v1.2.3 From bb63daf9efb4f2bcb657d7179a53bd808f978dc9 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 1 Feb 2016 18:51:06 -0500 Subject: team: track sum of rx_nohandler for all slaves CC: Jiri Pirko CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- include/linux/if_team.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index b84e49c3a738..174f43f43aff 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -24,6 +24,7 @@ struct team_pcpu_stats { struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; + u32 rx_nohandler; }; struct team; -- cgit v1.2.3 From 61d2bcae99f66a640b3dd9632180209143fb5512 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Feb 2016 21:03:07 -0800 Subject: tcp: fastopen: accept data/FIN present in SYNACK message RFC 7413 (TCP Fast Open) 4.2.2 states that the SYNACK message MAY include data and/or FIN This patch adds support for the client side : If we receive a SYNACK with payload or FIN, queue the skb instead of ignoring it. Since we already support the same for SYN, we refactor the existing code and reuse it. Note we need to clone the skb, so this operation might fail under memory pressure. Sara Dickinson pointed out FreeBSD server Fast Open implementation was planned to generate such SYNACK in the future. The server side might be implemented on linux later. Reported-by: Sara Dickinson Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index f6f8f032c73e..27f4c733116d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1437,6 +1437,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, -- cgit v1.2.3 From ba905f5e2f63d86ed4cfbd3d9096fb28d156f1ee Mon Sep 17 00:00:00 2001 From: Kim Jones Date: Tue, 2 Feb 2016 03:51:16 +0000 Subject: ethtool: Declare netdev_rss_key as __read_mostly. netdev_rss_key is written to once and thereafter is read by drivers when they are initialising. The fact that it is mostly read and not written to makes it a candidate for a __read_mostly declaration. Signed-off-by: Kim Jones Signed-off-by: Alan Carey Acked-by: Rami Rosen Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 78a20cec2a0a..219f53c30cb3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3744,7 +3744,7 @@ void netdev_lower_state_changed(struct net_device *lower_dev, /* RSS keys are 40 or 52 bytes long */ #define NETDEV_RSS_KEY_LEN 52 -extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN]; +extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len); int dev_get_nest_level(struct net_device *dev, -- cgit v1.2.3 From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:53 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do accurate counters without need to use BPF_XADD instruction which turned out to be too costly for high-performance network monitoring. In the typical use case the 'key' is the flow tuple or other long living object that sees a lot of events per second. bpf_map_lookup_elem() returns per-cpu area. Example: struct { u32 packets; u32 bytes; } * ptr = bpf_map_lookup_elem(&map, &key); /* ptr points to this_cpu area of the value, so the following * increments will not collide with other cpus */ ptr->packets ++; ptr->bytes += skb->len; bpf_update_elem() atomically creates a new element where all per-cpu values are zero initialized and this_cpu value is populated with given 'value'. Note that non-per-cpu hash map always allocates new element and then deletes old after rcu grace period to maintain atomicity of update. Per-cpu hash map updates element values in-place. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f8571de13..43ae40c8763e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, }; enum bpf_prog_type { -- cgit v1.2.3 From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:54 -0800 Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map Primary use case is a histogram array of latency where bpf program computes the latency of block requests or other events and stores histogram of latency into array of 64 elements. All cpus are constantly running, so normal increment is not accurate, bpf_xadd causes cache ping-pong and this per-cpu approach allows fastest collision-free counters. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83d1926c61e4..141fb0d45731 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -151,6 +151,7 @@ struct bpf_array { union { char value[0] __aligned(8); void *ptrs[0] __aligned(8); + void __percpu *pptrs[0] __aligned(8); }; }; #define MAX_TAIL_CALL_CNT 32 diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 43ae40c8763e..2ee0fde1bf96 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, }; enum bpf_prog_type { -- cgit v1.2.3 From 15a07b33814d14ca817887dbea8530728dc0fbe4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Feb 2016 22:39:55 -0800 Subject: bpf: add lookup/update support for per-cpu hash and array maps The functions bpf_map_lookup_elem(map, key, value) and bpf_map_update_elem(map, key, value, flags) need to get/set values from all-cpus for per-cpu hash and array maps, so that user space can aggregate/update them as necessary. Example of single counter aggregation in user space: unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF); long values[nr_cpus]; long value = 0; bpf_lookup_elem(fd, key, values); for (i = 0; i < nr_cpus; i++) value += values[i]; The user space must provide round_up(value_size, 8) * nr_cpus array to get/set values, since kernel will use 'long' copy of per-cpu values to try to copy good counters atomically. It's a best-effort, since bpf programs and user space are racing to access the same memory. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 141fb0d45731..90ee6ab24bc5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -183,6 +183,29 @@ int bpf_prog_new_fd(struct bpf_prog *prog); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname); +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 flags); +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 flags); + +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else -- cgit v1.2.3 From 7267bcda332e2782e21a559f3b1b859a35b4062d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sat, 16 Jan 2016 00:48:52 +0100 Subject: bcma: identify bus cores (devices) found on BCM47189 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing defines and print proper names. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3feb1b2d75d8..991ebb4c2015 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -151,6 +151,8 @@ struct bcma_host_ops { #define BCMA_CORE_PCIE2 0x83C /* PCI Express Gen2 */ #define BCMA_CORE_USB30_DEV 0x83D #define BCMA_CORE_ARM_CR4 0x83E +#define BCMA_CORE_GCI 0x840 +#define BCMA_CORE_CMEM 0x846 /* CNDS DDR2/3 memory controller */ #define BCMA_CORE_ARM_CA7 0x847 #define BCMA_CORE_SYS_MEM 0x849 #define BCMA_CORE_DEFAULT 0xFFF -- cgit v1.2.3 From 67edf354faaf93156646e741483b2313bc756c0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 19 Jan 2016 08:45:25 +0100 Subject: bcma: use _PMU_ in all names of PMU registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PMU (Power Management Unit) seems to be a separated piece of hardware, just accessed using ChipCommon core registers. In recent Broadcom chipsets PMU is not bounded to CC but available as separated core. To make code cleaner & easier to review (for a correct R/W access) use clearer names. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_driver_chipcommon.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index db51a6ffb7d6..96d8d56f240f 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -351,12 +351,12 @@ #define BCMA_CC_PMU_RES_REQTS 0x0640 /* PMU res req timer sel */ #define BCMA_CC_PMU_RES_REQT 0x0644 /* PMU res req timer */ #define BCMA_CC_PMU_RES_REQM 0x0648 /* PMU res req mask */ -#define BCMA_CC_CHIPCTL_ADDR 0x0650 -#define BCMA_CC_CHIPCTL_DATA 0x0654 -#define BCMA_CC_REGCTL_ADDR 0x0658 -#define BCMA_CC_REGCTL_DATA 0x065C -#define BCMA_CC_PLLCTL_ADDR 0x0660 -#define BCMA_CC_PLLCTL_DATA 0x0664 +#define BCMA_CC_PMU_CHIPCTL_ADDR 0x0650 +#define BCMA_CC_PMU_CHIPCTL_DATA 0x0654 +#define BCMA_CC_PMU_REGCTL_ADDR 0x0658 +#define BCMA_CC_PMU_REGCTL_DATA 0x065C +#define BCMA_CC_PMU_PLLCTL_ADDR 0x0660 +#define BCMA_CC_PMU_PLLCTL_DATA 0x0664 #define BCMA_CC_PMU_STRAPOPT 0x0668 /* (corerev >= 28) */ #define BCMA_CC_PMU_XTAL_FREQ 0x066C /* (pmurev >= 10) */ #define BCMA_CC_PMU_XTAL_FREQ_ILPCTL_MASK 0x00001FFF -- cgit v1.2.3 From b3c47afbf54d86daa0473895e8ca9e8b663f5c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 19 Jan 2016 08:45:26 +0100 Subject: bcma: support PMU present as separated bus core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On recent Broadcom chipsets PMU is present as separated core and it can't be accessed using ChipCommon anymore as it fails with e.g.: [ 0.000577] Unhandled fault: external abort on non-linefetch (0x1008) at 0xf1000604 Solve it by using a new (PMU) core pointer set to ChipCommon or PMU depending on the hardware capabilities. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_driver_chipcommon.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 96d8d56f240f..700d0c6f7480 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -217,6 +217,11 @@ #define BCMA_CC_CLKDIV_JTAG_SHIFT 8 #define BCMA_CC_CLKDIV_UART 0x000000FF #define BCMA_CC_CAP_EXT 0x00AC /* Capabilities */ +#define BCMA_CC_CAP_EXT_SECI_PRESENT 0x00000001 +#define BCMA_CC_CAP_EXT_GSIO_PRESENT 0x00000002 +#define BCMA_CC_CAP_EXT_GCI_PRESENT 0x00000004 +#define BCMA_CC_CAP_EXT_SECI_PUART_PRESENT 0x00000008 /* UART present */ +#define BCMA_CC_CAP_EXT_AOB_PRESENT 0x00000040 #define BCMA_CC_PLLONDELAY 0x00B0 /* Rev >= 4 only */ #define BCMA_CC_FREFSELDELAY 0x00B4 /* Rev >= 4 only */ #define BCMA_CC_SLOWCLKCTL 0x00B8 /* 6 <= Rev <= 9 only */ @@ -566,6 +571,7 @@ * Check availability with ((struct bcma_chipcommon)->capabilities & BCMA_CC_CAP_PMU) */ struct bcma_chipcommon_pmu { + struct bcma_device *core; /* Can be separated core or just ChipCommon one */ u8 rev; /* PMU revision */ u32 crystalfreq; /* The active crystal frequency (in kHz) */ }; @@ -660,6 +666,19 @@ struct bcma_drv_cc_b { #define bcma_cc_maskset32(cc, offset, mask, set) \ bcma_cc_write32(cc, offset, (bcma_cc_read32(cc, offset) & (mask)) | (set)) +/* PMU registers access */ +#define bcma_pmu_read32(cc, offset) \ + bcma_read32((cc)->pmu.core, offset) +#define bcma_pmu_write32(cc, offset, val) \ + bcma_write32((cc)->pmu.core, offset, val) + +#define bcma_pmu_mask32(cc, offset, mask) \ + bcma_pmu_write32(cc, offset, bcma_pmu_read32(cc, offset) & (mask)) +#define bcma_pmu_set32(cc, offset, set) \ + bcma_pmu_write32(cc, offset, bcma_pmu_read32(cc, offset) | (set)) +#define bcma_pmu_maskset32(cc, offset, mask, set) \ + bcma_pmu_write32(cc, offset, (bcma_pmu_read32(cc, offset) & (mask)) | (set)) + extern u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks); extern u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc); -- cgit v1.2.3 From 61dba73cdbba8ec5c01b31beaf9e2debc2d2f273 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Sun, 24 Jan 2016 16:37:33 +0100 Subject: bcma: add support for BCM47094 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's another SoC with 32 GPIOs and simplified watchdog handling. It was tested on D-Link DIR-885L. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 991ebb4c2015..0367c63f5960 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -201,6 +201,7 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM4707 1 #define BCMA_PKG_ID_BCM4708 2 #define BCMA_PKG_ID_BCM4709 0 +#define BCMA_CHIP_ID_BCM47094 53030 #define BCMA_CHIP_ID_BCM53018 53018 /* Board types (on PCI usually equals to the subsystem dev id) */ -- cgit v1.2.3 From e3e17b773bfe45462b7f3fae20c550025975cb13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Feb 2016 11:16:28 -0800 Subject: tcp: fastopen: call tcp_fin() if FIN present in SYNACK When we acknowledge a FIN, it is not enough to ack the sequence number and queue the skb into receive queue. We also have to call tcp_fin() to properly update socket state and send proper poll() notifications. It seems we also had the problem if we received a SYN packet with the FIN flag set, but it does not seem an urgent issue, as no known implementation can do that. Fixes: 61d2bcae99f6 ("tcp: fastopen: accept data/FIN present in SYNACK message") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 27f4c733116d..479d535609fd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -568,6 +568,7 @@ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); +void tcp_fin(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); -- cgit v1.2.3 From 0e715d6fbd2a4a1dcd215d6d51091346e6a3d3fa Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 2 Feb 2016 18:09:11 +0100 Subject: vxlan: cleanup types include/net/vxlan.h is a kernel header, no need to prefix fixed size types with double underscore. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0fb86442544b..5c64250619c5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -30,15 +30,15 @@ * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy */ struct vxlanhdr_gbp { - __u8 vx_flags; + u8 vx_flags; #ifdef __LITTLE_ENDIAN_BITFIELD - __u8 reserved_flags1:3, + u8 reserved_flags1:3, policy_applied:1, reserved_flags2:2, dont_learn:1, reserved_flags3:1; #elif defined(__BIG_ENDIAN_BITFIELD) - __u8 reserved_flags1:1, + u8 reserved_flags1:1, dont_learn:1, reserved_flags2:2, policy_applied:1, @@ -138,10 +138,10 @@ struct vxlan_config { int remote_ifindex; int mtu; __be16 dst_port; - __u16 port_min; - __u16 port_max; - __u8 tos; - __u8 ttl; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; u32 flags; unsigned long age_interval; unsigned int addrmax; -- cgit v1.2.3 From 427bc465bf9fcdab749f6997ff7a4eecaef4ca40 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 2 Feb 2016 18:09:12 +0100 Subject: vxlan: remove duplicated macros VNI_HASH_BITS and VNI_HASH_SIZE are defined twice. Remove the extra definitions. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 5c64250619c5..234bf1ef2737 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -9,9 +9,6 @@ #include #include -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1< Date: Tue, 2 Feb 2016 18:09:13 +0100 Subject: vxlan: restructure vxlan.h definitions RCO and GBP are VXLAN extensions, not specified in RFC 7348. Because of that, they need to be explicitly enabled when creating vxlan interface. By default, those extensions are not used and plain VXLAN header is sent and received. Reflect this in vxlan.h: first, the plain VXLAN header is defined. Following it, RCO is documented and defined, and likewise for GBP. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 104 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 234bf1ef2737..25bd919c9ef0 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -9,14 +9,71 @@ #include #include +/* VXLAN protocol (RFC 7348) header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|R|R|I|R|R|R| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * I = VXLAN Network Identifier (VNI) present. + */ +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +}; + +/* VXLAN header flags. */ +#define VXLAN_HF_VNI BIT(27) + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VID_MASK (VXLAN_N_VID - 1) +#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1<mark mapping @@ -59,44 +119,6 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) -/* VXLAN protocol header: - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |G|R|R|R|I|R|R|C| Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * G = 1 Group Policy (VXLAN-GBP) - * I = 1 VXLAN Network Identifier (VNI) present - * C = 1 Remote checksum offload (RCO) - */ -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; -}; - -/* VXLAN header flags. */ -#define VXLAN_HF_RCO BIT(21) -#define VXLAN_HF_VNI BIT(27) -#define VXLAN_HF_GBP BIT(31) - -/* Remote checksum offload header option */ -#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ -#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ -#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ -#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) -#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) - -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) - -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1< Date: Tue, 2 Feb 2016 07:43:45 -0800 Subject: net: Add support for fill_slave_info to VRF device Allows userspace to have direct access to VRF table association versus looking up master device and its table. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d3e90b91e07e..d452cea59020 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -405,6 +405,14 @@ enum { #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, -- cgit v1.2.3 From ddf1af6fa00e772fdb67a7d22cb83fac2b8968a8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 2 Feb 2016 10:33:06 -0800 Subject: tcp: new delivery accounting This patch changes the accounting of how many packets are newly acked or sacked when the sender receives an ACK. The current approach basically computes newly_acked_sacked = (prior_packets - prior_sacked) - (tp->packets_out - tp->sacked_out) where prior_packets and prior_sacked out are snapshot at the beginning of the ACK processing. The new approach tracks the delivery information via a new TCP state variable "delivered" which monotically increases as new packets are delivered in order or out-of-order. The reason for this change is that the current approach is brittle that produces negative or inaccurate estimate. 1) For non-SACK connections, an ACK that advances the SND.UNA could reset the DUPACK counters (tp->sacked_out) in tcp_process_loss() or tcp_fastretrans_alert(). This inflates the inflight suddenly and causes under-estimate or even negative estimate. Here is a real example: before after (processing ACK) packets_out 75 73 sacked_out 23 0 ca state Loss Open The old approach computes (75-23) - (73 - 0) = -21 delivered while the new approach computes 1 delivered since it considers the 2nd-24th packets are delivered OOO. 2) MSS change would re-count packets_out and sacked_out so the estimate is in-accurate and can even become negative. E.g., the inflight is doubled when MSS is halved. 3) Spurious retransmission signaled by DSACK is not accounted The new approach is simpler and more robust. For SACK connections, tp->delivered increments as packets are being acked or sacked in SACK and ACK processing. For non-sack connections, it's done in tcp_remove_reno_sacks() and tcp_add_reno_sack(). When an ACK advances the SND.UNA, tp->delivered is incremented by the number of packets ACKed (less the current number of DUPACKs received plus one packet hole). Upon receiving a DUPACK, tp->delivered is incremented assuming one out-of-order packet is delivered. Upon receiving a DSACK, tp->delivered is incremtened assuming one retransmission is delivered in tcp_sacktag_write_queue(). Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b386361ba3e8..d909feeeaea2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -256,6 +256,7 @@ struct tcp_sock { u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ + u32 delivered; /* Total data packets delivered incl. rexmits */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From 46fcc6ef9d39eb7b1becaa5ef5cba64d230f7c3f Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 2 Feb 2016 10:41:55 -0800 Subject: sunvnet: Add support for perf LDC event tracing Add perf event macros for support of tracing and instrumentation of LDC state machine Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- include/trace/events/sunvnet.h | 139 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 include/trace/events/sunvnet.h (limited to 'include') diff --git a/include/trace/events/sunvnet.h b/include/trace/events/sunvnet.h new file mode 100644 index 000000000000..eb080b267e55 --- /dev/null +++ b/include/trace/events/sunvnet.h @@ -0,0 +1,139 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sunvnet + +#if !defined(_TRACE_SUNVNET_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SUNVNET_H + +#include + +TRACE_EVENT(vnet_rx_one, + + TP_PROTO(int lsid, int rsid, int index, int needs_ack), + + TP_ARGS(lsid, rsid, index, needs_ack), + + TP_STRUCT__entry( + __field(int, lsid) + __field(int, rsid) + __field(int, index) + __field(int, needs_ack) + ), + + TP_fast_assign( + __entry->lsid = lsid; + __entry->rsid = rsid; + __entry->index = index; + __entry->needs_ack = needs_ack; + ), + + TP_printk("(%x:%x) walk_rx_one index %d; needs_ack %d", + __entry->lsid, __entry->rsid, + __entry->index, __entry->needs_ack) +); + +DECLARE_EVENT_CLASS(vnet_tx_stopped_ack_template, + + TP_PROTO(int lsid, int rsid, int ack_end, int npkts), + + TP_ARGS(lsid, rsid, ack_end, npkts), + + TP_STRUCT__entry( + __field(int, lsid) + __field(int, rsid) + __field(int, ack_end) + __field(int, npkts) + ), + + TP_fast_assign( + __entry->lsid = lsid; + __entry->rsid = rsid; + __entry->ack_end = ack_end; + __entry->npkts = npkts; + ), + + TP_printk("(%x:%x) stopped ack for %d; npkts %d", + __entry->lsid, __entry->rsid, + __entry->ack_end, __entry->npkts) +); +DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_send_stopped_ack, + TP_PROTO(int lsid, int rsid, int ack_end, int npkts), + TP_ARGS(lsid, rsid, ack_end, npkts)); +DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_defer_stopped_ack, + TP_PROTO(int lsid, int rsid, int ack_end, int npkts), + TP_ARGS(lsid, rsid, ack_end, npkts)); +DEFINE_EVENT(vnet_tx_stopped_ack_template, vnet_tx_pending_stopped_ack, + TP_PROTO(int lsid, int rsid, int ack_end, int npkts), + TP_ARGS(lsid, rsid, ack_end, npkts)); + +TRACE_EVENT(vnet_rx_stopped_ack, + + TP_PROTO(int lsid, int rsid, int end), + + TP_ARGS(lsid, rsid, end), + + TP_STRUCT__entry( + __field(int, lsid) + __field(int, rsid) + __field(int, end) + ), + + TP_fast_assign( + __entry->lsid = lsid; + __entry->rsid = rsid; + __entry->end = end; + ), + + TP_printk("(%x:%x) stopped ack for index %d", + __entry->lsid, __entry->rsid, __entry->end) +); + +TRACE_EVENT(vnet_tx_trigger, + + TP_PROTO(int lsid, int rsid, int start, int err), + + TP_ARGS(lsid, rsid, start, err), + + TP_STRUCT__entry( + __field(int, lsid) + __field(int, rsid) + __field(int, start) + __field(int, err) + ), + + TP_fast_assign( + __entry->lsid = lsid; + __entry->rsid = rsid; + __entry->start = start; + __entry->err = err; + ), + + TP_printk("(%x:%x) Tx trigger for %d sent with err %d %s", + __entry->lsid, __entry->rsid, __entry->start, + __entry->err, __entry->err > 0 ? "(ok)" : " ") +); + +TRACE_EVENT(vnet_skip_tx_trigger, + + TP_PROTO(int lsid, int rsid, int last), + + TP_ARGS(lsid, rsid, last), + + TP_STRUCT__entry( + __field(int, lsid) + __field(int, rsid) + __field(int, last) + ), + + TP_fast_assign( + __entry->lsid = lsid; + __entry->rsid = rsid; + __entry->last = last; + ), + + TP_printk("(%x:%x) Skip Tx trigger. Last trigger sent was %d", + __entry->lsid, __entry->rsid, __entry->last) +); +#endif /* _TRACE_SOCK_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 103a8ad1fa3b261c78dfc842cb315defe9d40be0 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 3 Feb 2016 04:04:36 +0100 Subject: ethtool: add speed/duplex validation functions Add functions which check if the speed/duplex are defined. Signed-off-by: Nikolay Aleksandrov Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 57fa39005e79..b2e180181629 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1319,11 +1319,45 @@ enum ethtool_sfeatures_retval_bits { #define SPEED_UNKNOWN -1 +static inline int ethtool_validate_speed(__u32 speed) +{ + switch (speed) { + case SPEED_10: + case SPEED_100: + case SPEED_1000: + case SPEED_2500: + case SPEED_5000: + case SPEED_10000: + case SPEED_20000: + case SPEED_25000: + case SPEED_40000: + case SPEED_50000: + case SPEED_56000: + case SPEED_100000: + case SPEED_UNKNOWN: + return 1; + } + + return 0; +} + /* Duplex, half or full. */ #define DUPLEX_HALF 0x00 #define DUPLEX_FULL 0x01 #define DUPLEX_UNKNOWN 0xff +static inline int ethtool_validate_duplex(__u8 duplex) +{ + switch (duplex) { + case DUPLEX_HALF: + case DUPLEX_FULL: + case DUPLEX_UNKNOWN: + return 1; + } + + return 0; +} + /* Which connector port. */ #define PORT_TP 0x00 #define PORT_AUI 0x01 -- cgit v1.2.3 From 6fa251663069e05daadd1666cbf3b658bf840ea4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:49 +0200 Subject: ipv4: Namespaceify tcp syn retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2b7907a35568..b7b5bd64df35 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,6 +98,8 @@ struct netns_ipv4 { int sysctl_tcp_keepalive_probes; int sysctl_tcp_keepalive_intvl; + int sysctl_tcp_syn_retries; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; diff --git a/include/net/tcp.h b/include/net/tcp.h index 479d535609fd..825485c7cc1a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_syn_retries; extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; -- cgit v1.2.3 From 7c083ecb3ba4583a625d5ff9655d1a819e374493 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:50 +0200 Subject: ipv4: Namespaceify tcp synack retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b7b5bd64df35..9e83084ab8c1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -99,6 +99,7 @@ struct netns_ipv4 { int sysctl_tcp_keepalive_intvl; int sysctl_tcp_syn_retries; + int sysctl_tcp_synack_retries; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 825485c7cc1a..05659e860039 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; -- cgit v1.2.3 From 12ed8244ed8b31b023ea6d2851fd8b15f2999e9b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:51 +0200 Subject: ipv4: Namespaceify tcp syncookies sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e83084ab8c1..ac000fccdf0f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -101,6 +101,8 @@ struct netns_ipv4 { int sysctl_tcp_syn_retries; int sysctl_tcp_synack_retries; + int sysctl_tcp_syncookies; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; diff --git a/include/net/tcp.h b/include/net/tcp.h index 05659e860039..1fb23b70d237 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ extern int sysctl_tcp_fin_timeout; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; -extern int sysctl_tcp_syncookies; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; -- cgit v1.2.3 From 1043e25ff96a1efc7bd34d11f5f32203a28a3bd7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:52 +0200 Subject: ipv4: Namespaceify tcp reordering sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- include/net/tcp.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ac000fccdf0f..eb4cd0a3c296 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -100,8 +100,8 @@ struct netns_ipv4 { int sysctl_tcp_syn_retries; int sysctl_tcp_synack_retries; - int sysctl_tcp_syncookies; + int sysctl_tcp_reordering; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 1fb23b70d237..7e9a147cabae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -961,9 +961,11 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) */ static inline void tcp_enable_early_retrans(struct tcp_sock *tp) { + struct net *net = sock_net((struct sock *)tp); + tp->do_early_retrans = sysctl_tcp_early_retrans && sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && - sysctl_tcp_reordering == 3; + net->ipv4.sysctl_tcp_reordering == 3; } static inline void tcp_disable_early_retrans(struct tcp_sock *tp) -- cgit v1.2.3 From ae5c3f406cffe15ffd2aa544961b7cd027468d46 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:53 +0200 Subject: ipv4: Namespaceify tcp_retries1 sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb4cd0a3c296..dee6ba647461 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -102,6 +102,7 @@ struct netns_ipv4 { int sysctl_tcp_synack_retries; int sysctl_tcp_syncookies; int sysctl_tcp_reordering; + int sysctl_tcp_retries1; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7e9a147cabae..da96b9af3e5f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_fastopen; -- cgit v1.2.3 From c6214a97c86c660de4f7ddb8eed925192e646161 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:54 +0200 Subject: ipv4: Namespaceify tcp_retries2 sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index dee6ba647461..d92c8e5d0fbc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -103,6 +103,7 @@ struct netns_ipv4 { int sysctl_tcp_syncookies; int sysctl_tcp_reordering; int sysctl_tcp_retries1; + int sysctl_tcp_retries2; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index da96b9af3e5f..a786cfa6301b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; -- cgit v1.2.3 From c402d9beffb6141ab2e4d2ad8be71128803a28ca Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:55 +0200 Subject: ipv4: Namespaceify tcp_orphan_retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d92c8e5d0fbc..080230321985 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -104,6 +104,7 @@ struct netns_ipv4 { int sysctl_tcp_reordering; int sysctl_tcp_retries1; int sysctl_tcp_retries2; + int sysctl_tcp_orphan_retries; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index a786cfa6301b..71f840b89c76 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; -- cgit v1.2.3 From 1e579caa18b96f9eb18f4f5416658cd15f37c062 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:56 +0200 Subject: ipv4: Namespaceify tcp_fin_timeout sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 080230321985..de5ff4385e84 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -105,6 +105,7 @@ struct netns_ipv4 { int sysctl_tcp_retries1; int sysctl_tcp_retries2; int sysctl_tcp_orphan_retries; + int sysctl_tcp_fin_timeout; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 71f840b89c76..3f160c2e6960 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -239,7 +239,6 @@ extern struct inet_timewait_death_row tcp_death_row; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; -extern int sysctl_tcp_fin_timeout; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; @@ -1249,7 +1248,7 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) -- cgit v1.2.3 From 4979f2d9f7262b9b180bc83de8d70f7a7721c085 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:57 +0200 Subject: ipv4: Namespaceify tcp_notsent_lowat sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index de5ff4385e84..4d6ec3f6fafe 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -106,6 +106,7 @@ struct netns_ipv4 { int sysctl_tcp_retries2; int sysctl_tcp_orphan_retries; int sysctl_tcp_fin_timeout; + unsigned int sysctl_tcp_notsent_lowat; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3f160c2e6960..9b2cb0c8d876 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -267,7 +267,6 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; -extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; @@ -1682,7 +1681,8 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { - return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat; + struct net *net = sock_net((struct sock *)tp); + return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; } static inline bool tcp_stream_memory_free(const struct sock *sk) -- cgit v1.2.3 From 157ede6784ba2837c7dc43f195418c75927f8488 Mon Sep 17 00:00:00 2001 From: Elad Raz Date: Wed, 3 Feb 2016 09:57:04 +0100 Subject: bridge: mdb: add support for offloaded mdb entries Add new bitmask member 'flags' to br_mdb_entry structure. Adding MDB_FLAGS_OFFLOAD bit which indicates MDB entries is offloaded to hardware. Signed-off-by: Elad Raz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 18db14477bdd..ec3547234998 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -183,6 +183,8 @@ struct br_mdb_entry { #define MDB_TEMPORARY 0 #define MDB_PERMANENT 1 __u8 state; +#define MDB_FLAGS_OFFLOAD (1 << 0) + __u8 flags; __u16 vid; struct { union { -- cgit v1.2.3 From 5ee14e6d336f1daacf5ba73e831029c5ab7ae329 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 3 Feb 2016 13:17:01 +0100 Subject: bonding: 3ad: apply ad_actor settings changes immediately Currently the bonding allows to set ad_actor_system and prio while the bond device is down, but these are actually applied only if there aren't any slaves yet (applied to bond device when first slave shows up, and to slaves at 3ad bind time). After this patch changes are applied immediately and the new values can be used/seen after the bond's upped so it's not necessary anymore to release all and enslave again to see the changes. CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/net/bond_3ad.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index f1fbc3b11962..f358ad5e4214 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -306,5 +306,6 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_ad_actor_settings(struct bonding *bond); #endif /* _NET_BOND_3AD_H */ -- cgit v1.2.3 From 086c653f5862591a9cfe2386f5650d03adacc33a Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:35 -0500 Subject: sock: struct proto hash function may error In order to support fast reuseport lookups in TCP, the hash function defined in struct proto must be capable of returning an error code. This patch changes the function signature of all related hash functions to return an integer and handles or propagates this return value at all call sites. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 2 +- include/net/phonet/phonet.h | 2 +- include/net/ping.h | 2 +- include/net/raw.h | 2 +- include/net/sock.h | 6 +++--- include/net/udp.h | 3 ++- 6 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index de2e3ade6102..554440e7f83d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -208,7 +208,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk); -void inet_hash(struct sock *sk); +int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 68e509750caa..039cc29cb4a8 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -51,7 +51,7 @@ void pn_sock_init(void); struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa); void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb); void phonet_get_local_port_range(int *min, int *max); -void pn_sock_hash(struct sock *sk); +int pn_sock_hash(struct sock *sk); void pn_sock_unhash(struct sock *sk); int pn_sock_get_port(struct sock *sk, unsigned short sport); diff --git a/include/net/ping.h b/include/net/ping.h index ac80cb45e630..5fd7cc244833 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -65,7 +65,7 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -void ping_hash(struct sock *sk); +int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); diff --git a/include/net/raw.h b/include/net/raw.h index 6a40c6562dd2..3e789008394d 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -57,7 +57,7 @@ int raw_seq_open(struct inode *ino, struct file *file, #endif -void raw_hash_sk(struct sock *sk); +int raw_hash_sk(struct sock *sk); void raw_unhash_sk(struct sock *sk); struct raw_sock { diff --git a/include/net/sock.h b/include/net/sock.h index f5ea148853e2..255d3e03727b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -984,7 +984,7 @@ struct proto { void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ - void (*hash)(struct sock *sk); + int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); @@ -1194,10 +1194,10 @@ static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. */ -static inline void __sk_prot_rehash(struct sock *sk) +static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); + return sk->sk_prot->hash(sk); } void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); diff --git a/include/net/udp.h b/include/net/udp.h index 2842541e28e7..92927f729ac8 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -177,9 +177,10 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ -static inline void udp_lib_hash(struct sock *sk) +static inline int udp_lib_hash(struct sock *sk) { BUG(); + return 0; } void udp_lib_unhash(struct sock *sk); -- cgit v1.2.3 From 496611d7b5eaf59c03440c8f2def1d9988ad2459 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:36 -0500 Subject: inet: create IPv6-equivalent inet_hash function In order to support fast lookups for TCP sockets with SO_REUSEPORT, the function that adds sockets to the listening hash set needs to be able to check receive address equality. Since this equality check is different for IPv4 and IPv6, we will need two different socket hashing functions. This patch adds inet6_hash identical to the existing inet_hash function and updates the appropriate references. A following patch will differentiate the two by passing different comparison functions to __inet_hash. Additionally, in order to use the IPv6 address equality function from inet6_hashtables (which is compiled as a built-in object when IPv6 is enabled) it also needs to be in a built-in object file as well. This moves ipv6_rcv_saddr_equal into inet_hashtables to accomplish this. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 7ff588ca6817..b3c28a9dfbf1 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -96,6 +96,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); + +int inet6_hash(struct sock *sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ -- cgit v1.2.3 From d9b3fca27385eafe61c3ca6feab6cb1e7dc77482 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:37 -0500 Subject: tcp: __tcp_hdrlen() helper tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr. This splits the size calculation into a helper function that can be used if a struct tcphdr is already available. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d909feeeaea2..bcbf51da4e1e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) return (struct tcphdr *)skb_transport_header(skb); } +static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) +{ + return th->doff * 4; +} + static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { - return tcp_hdr(skb)->doff * 4; + return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) -- cgit v1.2.3 From a583636a83ea383fd07517e5a7a2eedbc5d90fb1 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:38 -0500 Subject: inet: refactor inet[6]_lookup functions to take skb This is a preliminary step to allow fast socket lookup of SO_REUSEPORT groups. Doing so with a BPF filter will require access to the skb in question. This change plumbs the skb (and offset to payload data) through the call stack to the listening socket lookup implementations where it will be used in a following patch. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 ++ include/net/inet6_hashtables.h | 11 +++++++---- include/net/inet_hashtables.h | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 47f52d3cd8df..730d856683e5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -87,6 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); +int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index b3c28a9dfbf1..28332bdac333 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -53,6 +53,7 @@ struct sock *__inet6_lookup_established(struct net *net, struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -60,6 +61,7 @@ struct sock *inet6_lookup_listener(struct net *net, static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -71,12 +73,12 @@ static inline struct sock *__inet6_lookup(struct net *net, if (sk) return sk; - return inet6_lookup_listener(net, hashinfo, saddr, sport, + return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif); } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, + struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif) @@ -86,13 +88,14 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - &ipv6_hdr(skb)->saddr, sport, + return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), iif); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 554440e7f83d..82403390af58 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -213,6 +213,7 @@ void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, @@ -220,10 +221,11 @@ struct sock *__inet_lookup_listener(struct net *net, static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __inet_lookup_listener(net, hashinfo, saddr, sport, + return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif); } @@ -299,6 +301,7 @@ static inline struct sock * static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) @@ -307,12 +310,13 @@ static inline struct sock *__inet_lookup(struct net *net, struct sock *sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif); - return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport, - daddr, hnum, dif); + return sk ? : __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + sport, daddr, hnum, dif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) @@ -320,7 +324,8 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; local_bh_disable(); - sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif); + sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + dport, dif); local_bh_enable(); return sk; @@ -328,6 +333,7 @@ static inline struct sock *inet_lookup(struct net *net, static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, + int doff, const __be16 sport, const __be16 dport) { @@ -337,8 +343,8 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; else - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - iph->saddr, sport, + return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb)); } -- cgit v1.2.3 From c125e80b88687b25b321795457309eaaee4bf270 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:40 -0500 Subject: soreuseport: fast reuseport TCP socket selection This change extends the fast SO_REUSEPORT socket lookup implemented for UDP to TCP. Listener sockets with SO_REUSEPORT and the same receive address are additionally added to an array for faster random access. This means that only a single socket from the group must be found in the listener list before any socket in the group can be used to receive a packet. Previously, every socket in the group needed to be considered before handing off the incoming packet. This feature also exposes the ability to use a BPF program when selecting a socket from a reuseport group. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 82403390af58..50f635c2c536 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -207,7 +207,10 @@ void inet_hashinfo_init(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); -void __inet_hash(struct sock *sk, struct sock *osk); +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); -- cgit v1.2.3 From 12b74dfadb5a7a23baf4db941dc9fd9d371f249a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Feb 2016 13:31:17 +0100 Subject: ipv4: add option to drop unicast encapsulated in L2 multicast In order to solve a problem with 802.11, the so-called hole-196 attack, add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if enabled, causes the stack to drop IPv4 unicast packets encapsulated in link-layer multi- or broadcast frames. Such frames can (as an attack) be created by any member of the same wireless network and transmitted as valid encrypted frames since the symmetric key for broadcast frames is shared between all stations. Additionally, enabling this option provides compliance with a SHOULD clause of RFC 1122. Reviewed-by: Julian Anastasov Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/ip.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 08f894d2ddbd..584834f7e95c 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -165,6 +165,7 @@ enum IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, __IPV4_DEVCONF_MAX }; -- cgit v1.2.3 From 97daf331455077645ae1f13438bebd3d1a2e94ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Feb 2016 13:31:18 +0100 Subject: ipv4: add option to drop gratuitous ARP packets In certain 802.11 wireless deployments, there will be ARP proxies that use knowledge of the network to correctly answer requests. To prevent gratuitous ARP frames on the shared medium from being a problem, on such deployments wireless needs to drop them. Enable this by providing an option called "drop_gratuitous_arp". Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/ip.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 584834f7e95c..f291569768dd 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -166,6 +166,7 @@ enum IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + IPV4_DEVCONF_DROP_GRATUITOUS_ARP, __IPV4_DEVCONF_MAX }; -- cgit v1.2.3 From abbc30436d39dfed8ebfca338d253f211ac7b094 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Feb 2016 13:31:19 +0100 Subject: ipv6: add option to drop unicast encapsulated in L2 multicast In order to solve a problem with 802.11, the so-called hole-196 attack, add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if enabled, causes the stack to drop IPv6 unicast packets encapsulated in link-layer multi- or broadcast frames. Such frames can (as an attack) be created by any member of the same wireless network and transmitted as valid encrypted frames since the symmetric key for broadcast frames is shared between all stations. Reviewed-by: Julian Anastasov Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 402753bccafa..4a4c1ae826cb 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -50,6 +50,7 @@ struct ipv6_devconf { __s32 mc_forwarding; #endif __s32 disable_ipv6; + __s32 drop_unicast_in_l2_multicast; __s32 accept_dad; __s32 force_tllao; __s32 ndisc_notify; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 38b4fef20219..4c413570efe8 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -174,6 +174,7 @@ enum { DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, + DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, DEVCONF_MAX }; -- cgit v1.2.3 From 7a02bf892d8f1e5298af1676f001bee410509d80 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Feb 2016 13:31:20 +0100 Subject: ipv6: add option to drop unsolicited neighbor advertisements In certain 802.11 wireless deployments, there will be NA proxies that use knowledge of the network to correctly answer requests. To prevent unsolicitd advertisements on the shared medium from being a problem, on such deployments wireless needs to drop them. Enable this by providing an option called "drop_unsolicited_na". Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 4a4c1ae826cb..4b2267e1b7c3 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -56,6 +56,7 @@ struct ipv6_devconf { __s32 ndisc_notify; __s32 suppress_frag_ndisc; __s32 accept_ra_mtu; + __s32 drop_unsolicited_na; struct ipv6_stable_secret { bool initialized; struct in6_addr secret; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 4c413570efe8..ec117b65d5a5 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -175,6 +175,7 @@ enum { DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, + DEVCONF_DROP_UNSOLICITED_NA, DEVCONF_MAX }; -- cgit v1.2.3 From 72bb68721f80a1441e871b6afc9ab0b3793d5031 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 5 Feb 2016 11:16:21 +0000 Subject: ethtool: add IPv6 to the NFC API Signed-off-by: Edward Cree Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b2e180181629..5e0940dcbfe8 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -748,6 +748,56 @@ struct ethtool_usrip4_spec { __u8 proto; }; +/** + * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc. + * @ip6src: Source host + * @ip6dst: Destination host + * @psrc: Source port + * @pdst: Destination port + * @tclass: Traffic Class + * + * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow. + */ +struct ethtool_tcpip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be16 psrc; + __be16 pdst; + __u8 tclass; +}; + +/** + * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @spi: Security parameters index + * @tclass: Traffic Class + * + * This can be used to specify an IPsec transport or tunnel over IPv6. + */ +struct ethtool_ah_espip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 spi; + __u8 tclass; +}; + +/** + * struct ethtool_usrip6_spec - general flow specification for IPv6 + * @ip6src: Source host + * @ip6dst: Destination host + * @l4_4_bytes: First 4 bytes of transport (layer 4) header + * @tclass: Traffic Class + * @l4_proto: Transport protocol number (nexthdr after any Extension Headers) + */ +struct ethtool_usrip6_spec { + __be32 ip6src[4]; + __be32 ip6dst[4]; + __be32 l4_4_bytes; + __u8 tclass; + __u8 l4_proto; +}; + union ethtool_flow_union { struct ethtool_tcpip4_spec tcp_ip4_spec; struct ethtool_tcpip4_spec udp_ip4_spec; @@ -755,6 +805,12 @@ union ethtool_flow_union { struct ethtool_ah_espip4_spec ah_ip4_spec; struct ethtool_ah_espip4_spec esp_ip4_spec; struct ethtool_usrip4_spec usr_ip4_spec; + struct ethtool_tcpip6_spec tcp_ip6_spec; + struct ethtool_tcpip6_spec udp_ip6_spec; + struct ethtool_tcpip6_spec sctp_ip6_spec; + struct ethtool_ah_espip6_spec ah_ip6_spec; + struct ethtool_ah_espip6_spec esp_ip6_spec; + struct ethtool_usrip6_spec usr_ip6_spec; struct ethhdr ether_spec; __u8 hdata[52]; }; @@ -1401,15 +1457,17 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ #define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */ #define AH_ESP_V4_FLOW 0x04 /* hash only */ -#define TCP_V6_FLOW 0x05 /* hash only */ -#define UDP_V6_FLOW 0x06 /* hash only */ -#define SCTP_V6_FLOW 0x07 /* hash only */ +#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */ +#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */ +#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */ #define AH_ESP_V6_FLOW 0x08 /* hash only */ #define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */ #define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */ -#define AH_V6_FLOW 0x0b /* hash only */ -#define ESP_V6_FLOW 0x0c /* hash only */ -#define IP_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */ +#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */ +#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */ +#define IP_USER_FLOW IPV4_USER_FLOW +#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */ #define IPV4_FLOW 0x10 /* hash only */ #define IPV6_FLOW 0x11 /* hash only */ #define ETHER_FLOW 0x12 /* spec only (ether_spec) */ -- cgit v1.2.3 From 76443456227097179c14826425f88a95d81a892e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 5 Feb 2016 15:27:37 -0800 Subject: net: Move GSO csum into SKB_GSO_CB This patch moves the checksum maintained by GSO out of skb->csum and into the GSO context block in order to allow for us to work on outer checksums while maintaining the inner checksum offsets in the case of the inner checksum being offloaded, while the outer checksums will be computed. While updating the code I also did a minor cleanu-up on gso_make_checksum. The change is mostly to make it so that we store the values and compute the checksum instead of computing the checksum and then storing the values we needed to update. Signed-off-by: Alexander Duyck Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 11f935c1a090..acece7ce376f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3549,6 +3549,7 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb) struct skb_gso_cb { int mac_offset; int encap_level; + __wsum csum; __u16 csum_start; }; #define SKB_SGO_CB_OFFSET 32 @@ -3585,15 +3586,14 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { - int plen = SKB_GSO_CB(skb)->csum_start - skb_headroom(skb) - - skb_transport_offset(skb); - __wsum partial; + unsigned char *csum_start = skb_transport_header(skb); + int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; + __wsum partial = SKB_GSO_CB(skb)->csum; - partial = csum_partial(skb_transport_header(skb), plen, skb->csum); - skb->csum = res; - SKB_GSO_CB(skb)->csum_start -= plen; + SKB_GSO_CB(skb)->csum = res; + SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; - return csum_fold(partial); + return csum_fold(csum_partial(csum_start, plen, partial)); } static inline bool skb_is_gso(const struct sk_buff *skb) -- cgit v1.2.3 From 08b64fcca942733413bc5ac2321d57021d3e8578 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 5 Feb 2016 15:27:49 -0800 Subject: net: Store checksum result for offloaded GSO checksums This patch makes it so that we can offload the checksums for a packet up to a certain point and then begin computing the checksums via software. Setting this up is fairly straight forward as all we need to do is reset the values stored in csum and csum_start for the GSO context block. One complication for this is remote checksum offload. In order to allow the inner checksums to be offloaded while computing the outer checksum manually we needed to have some way of indicating that the offload wasn't real. In order to do that I replaced CHECKSUM_PARTIAL with CHECKSUM_UNNECESSARY in the case of us computing checksums for the outer header while skipping computing checksums for the inner headers. We clean up the ip_summed flag and set it to either CHECKSUM_PARTIAL or CHECKSUM_NONE once we hand the packet off to the next lower level. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acece7ce376f..a8fc2220e8ce 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2161,6 +2161,11 @@ static inline int skb_checksum_start_offset(const struct sk_buff *skb) return skb->csum_start - skb_headroom(skb); } +static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) +{ + return skb->head + skb->csum_start; +} + static inline int skb_transport_offset(const struct sk_buff *skb) { return skb_transport_header(skb) - skb->data; @@ -3576,6 +3581,16 @@ static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) return 0; } +static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) +{ + /* Do not update partial checksums if remote checksum is enabled. */ + if (skb->remcsum_offload) + return; + + SKB_GSO_CB(skb)->csum = res; + SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; +} + /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). -- cgit v1.2.3 From 4456ed04ea44b800d691b18c14a68ec9894d2aca Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 7 Feb 2016 23:27:55 +0200 Subject: ethtool: future-proof interface for speed extensions Many virtual and not quite virtual devices allow any speed to be set through ethtool. In particular, this applies to the virtio-net devices. Document this fact to make sure people don't assume the enum lists all possible values. Reserve values greater than INT_MAX for future extension and to avoid conflict with SPEED_UNKNOWN. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 5e0940dcbfe8..4345f80a2e33 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -31,7 +31,7 @@ * physical connectors and other link features that are * advertised through autonegotiation or enabled for * auto-detection. - * @speed: Low bits of the speed + * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN * @duplex: Duplex mode; one of %DUPLEX_* * @port: Physical connector type; one of %PORT_* * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not @@ -47,7 +47,7 @@ * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. * @maxrxpkt: Historically used to report RX IRQ coalescing; now * obsoleted by &struct ethtool_coalesce. Read-only; deprecated. - * @speed_hi: High bits of the speed + * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of * %ETH_TP_MDI_*. If the status is unknown or not applicable, the * value will be %ETH_TP_MDI_INVALID. Read-only. @@ -1359,7 +1359,7 @@ enum ethtool_sfeatures_retval_bits { * it was forced up into this mode or autonegotiated. */ -/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|5|10|20|25|40|50|56|100]GbE. */ +/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */ #define SPEED_10 10 #define SPEED_100 100 #define SPEED_1000 1000 -- cgit v1.2.3 From 4a92602aa1cd5bbaeedbd9536ff992f7d26fe9d1 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Fri, 5 Feb 2016 09:20:52 -0700 Subject: openvswitch: allow management from inside user namespaces Operations with the GENL_ADMIN_PERM flag fail permissions checks because this flag means we call netlink_capable, which uses the init user ns. Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations which should be allowed inside a user namespace. The motivation for this is to be able to run openvswitch in unprivileged containers. I've tested this and it seems to work, but I really have no idea about the security consequences of this patch, so thoughts would be much appreciated. v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function v3: use separate ifs for UNS_ADMIN_PERM and ADMIN_PERM, instead of one massive one Reported-by: James Page Signed-off-by: Tycho Andersen CC: Eric Biederman CC: Pravin Shelar CC: Justin Pettit CC: "David S. Miller" Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index c3363ba1ae05..5512c90af7e3 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -21,6 +21,7 @@ struct genlmsghdr { #define GENL_CMD_CAP_DO 0x02 #define GENL_CMD_CAP_DUMP 0x04 #define GENL_CMD_CAP_HASPOL 0x08 +#define GENL_UNS_ADMIN_PERM 0x10 /* * List of reserved static generic netlink identifiers: -- cgit v1.2.3 From 815c52700746cdcc0874a33390bac334a4b90107 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Feb 2016 23:29:21 +0200 Subject: igmp: Namespaceify igmp_max_memberships sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 - include/net/netns/ipv4.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 9c9de11549a7..57d6d06ce0b3 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -38,7 +38,6 @@ static inline struct igmpv3_query * } extern int sysctl_igmp_llm_reports; -extern int sysctl_igmp_max_memberships; extern int sysctl_igmp_max_msf; extern int sysctl_igmp_qrv; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4d6ec3f6fafe..759cf624eec2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -108,6 +108,8 @@ struct netns_ipv4 { int sysctl_tcp_fin_timeout; unsigned int sysctl_tcp_notsent_lowat; + int sysctl_igmp_max_memberships; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; -- cgit v1.2.3 From 166b6b2d6f01be67a83b87ab5c91350a68b17115 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Feb 2016 23:29:22 +0200 Subject: igmp: Namespaceify igmp_max_msf sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 - include/net/netns/ipv4.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 57d6d06ce0b3..a91ec9f575e7 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -38,7 +38,6 @@ static inline struct igmpv3_query * } extern int sysctl_igmp_llm_reports; -extern int sysctl_igmp_max_msf; extern int sysctl_igmp_qrv; struct ip_sf_socklist { diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 759cf624eec2..522a2cfe1ad9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -109,6 +109,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_notsent_lowat; int sysctl_igmp_max_memberships; + int sysctl_igmp_max_msf; struct ping_group_range ping_group_range; -- cgit v1.2.3 From 87a8a2ae65b7721893c7922f963502be8fa01c94 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 9 Feb 2016 00:13:50 +0200 Subject: igmp: Namespaceify igmp_llm_reports sysctl knob This was initially introduced in df2cf4a78e488d26 ("IGMP: Inhibit reports for local multicast groups") by defining the sysctl in the ipv4_net_table array, however it was never implemented to be namespace aware. Fix this by changing the code accordingly. Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 - include/net/netns/ipv4.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index a91ec9f575e7..c683f4bf642b 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -37,7 +37,6 @@ static inline struct igmpv3_query * return (struct igmpv3_query *)skb_transport_header(skb); } -extern int sysctl_igmp_llm_reports; extern int sysctl_igmp_qrv; struct ip_sf_socklist { diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 522a2cfe1ad9..cbbf8115e8a7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -110,6 +110,7 @@ struct netns_ipv4 { int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; + int sysctl_igmp_llm_reports; struct ping_group_range ping_group_range; -- cgit v1.2.3 From 165094afcee79e4d5b6e94032a5d3be157460b4a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Feb 2016 23:29:24 +0200 Subject: igmp: Namespacify igmp_qrv sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 -- include/net/netns/ipv4.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index c683f4bf642b..12f6fba6d21a 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -37,8 +37,6 @@ static inline struct igmpv3_query * return (struct igmpv3_query *)skb_transport_header(skb); } -extern int sysctl_igmp_qrv; - struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index cbbf8115e8a7..848fe8056534 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,7 @@ struct netns_ipv4 { int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; int sysctl_igmp_llm_reports; + int sysctl_igmp_qrv; struct ping_group_range ping_group_range; -- cgit v1.2.3 From e02564ee334a7ae46b71fc18576391cb9455433e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Sun, 7 Feb 2016 21:52:23 +0100 Subject: ethtool: make validate_speed accept all speeds between 0 and INT_MAX Devices these days can have any speed and as was recently pointed out any speed from 0 to INT_MAX is valid so adjust speed validation to accept such values. Signed-off-by: Nikolay Aleksandrov Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 4345f80a2e33..190aea0faaf4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1377,24 +1377,7 @@ enum ethtool_sfeatures_retval_bits { static inline int ethtool_validate_speed(__u32 speed) { - switch (speed) { - case SPEED_10: - case SPEED_100: - case SPEED_1000: - case SPEED_2500: - case SPEED_5000: - case SPEED_10000: - case SPEED_20000: - case SPEED_25000: - case SPEED_40000: - case SPEED_50000: - case SPEED_56000: - case SPEED_100000: - case SPEED_UNKNOWN: - return 1; - } - - return 0; + return speed <= INT_MAX || speed == SPEED_UNKNOWN; } /* Duplex, half or full. */ -- cgit v1.2.3 From 795bb1c00dd338aa0d12f9a7f1f4776fb3160416 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Feb 2016 13:14:59 +0100 Subject: net: bulk free infrastructure for NAPI context, use napi_consume_skb Discovered that network stack were hitting the kmem_cache/SLUB slowpath when freeing SKBs. Doing bulk free with kmem_cache_free_bulk can speedup this slowpath. NAPI context is a bit special, lets take advantage of that for bulk free'ing SKBs. In NAPI context we are running in softirq, which gives us certain protection. A softirq can run on several CPUs at once. BUT the important part is a softirq will never preempt another softirq running on the same CPU. This gives us the opportunity to access per-cpu variables in softirq context. Extend napi_alloc_cache (before only contained page_frag_cache) to be a struct with a small array based stack for holding SKBs. Introduce a SKB defer and flush API for accessing this. Introduce napi_consume_skb() as replacement for e.g. dev_consume_skb_any() when running in NAPI context. A small trick to handle/detect if we are called from netpoll is to see if budget is 0. In that case, we need to invoke dev_consume_skb_irq(). Joint work with Alexander Duyck. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a8fc2220e8ce..b56c0103fa15 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2404,6 +2404,9 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, { return __napi_alloc_skb(napi, length, GFP_ATOMIC); } +void napi_consume_skb(struct sk_buff *skb, int budget); + +void __kfree_skb_flush(void); /** * __dev_alloc_pages - allocate page for network Rx -- cgit v1.2.3 From 15fad714be86eab13e7568fecaf475b2a9730d3e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 8 Feb 2016 13:15:04 +0100 Subject: net: bulk free SKBs that were delay free'ed due to IRQ context The network stack defers SKBs free, in-case free happens in IRQ or when IRQs are disabled. This happens in __dev_kfree_skb_irq() that writes SKBs that were free'ed during IRQ to the softirq completion queue (softnet_data.completion_queue). These SKBs are naturally delayed, and cleaned up during NET_TX_SOFTIRQ in function net_tx_action(). Take advantage of this a use the skb defer and flush API, as we are already in softirq context. For modern drivers this rarely happens. Although most drivers do call dev_kfree_skb_any(), which detects the situation and calls __dev_kfree_skb_irq() when needed. This due to netpoll can call from IRQ context. Signed-off-by: Alexander Duyck Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b56c0103fa15..6ec86f1a2ed9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2407,6 +2407,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, void napi_consume_skb(struct sk_buff *skb, int budget); void __kfree_skb_flush(void); +void __kfree_skb_defer(struct sk_buff *skb); /** * __dev_alloc_pages - allocate page for network Rx -- cgit v1.2.3 From 179bc67f69b6cb53ad68cfdec5a917c2a2248355 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 11 Feb 2016 20:48:04 +0000 Subject: net: local checksum offload for encapsulation The arithmetic properties of the ones-complement checksum mean that a correctly checksummed inner packet, including its checksum, has a ones complement sum depending only on whatever value was used to initialise the checksum field before checksumming (in the case of TCP and UDP, this is the ones complement sum of the pseudo header, complemented). Consequently, if we are going to offload the inner checksum with CHECKSUM_PARTIAL, we can compute the outer checksum based only on the packed data not covered by the inner checksum, and the initial value of the inner checksum field. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/skbuff.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6ec86f1a2ed9..cf906d1ce8a7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3702,5 +3702,29 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) return hdr_len + skb_gso_transport_seglen(skb); } +/* Local Checksum Offload. + * Compute outer checksum based on the assumption that the + * inner checksum will be offloaded later. + * Fill in outer checksum adjustment (e.g. with sum of outer + * pseudo-header) before calling. + * Also ensure that inner checksum is in linear data area. + */ +static inline __wsum lco_csum(struct sk_buff *skb) +{ + char *inner_csum_field; + __wsum csum; + + /* Start with complement of inner checksum adjustment */ + inner_csum_field = skb->data + skb_checksum_start_offset(skb) + + skb->csum_offset; + csum = ~csum_unfold(*(__force __sum16 *)inner_csum_field); + /* Add in checksum of our headers (incl. outer checksum + * adjustment filled in by caller) + */ + csum = skb_checksum(skb, 0, skb_checksum_start_offset(skb), csum); + /* The result is the checksum from skb->data to end of packet */ + return csum; +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ -- cgit v1.2.3 From 21e2e7f9b5fefdbf94a107a9b24d74baa5148ef3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 11 Feb 2016 20:50:44 +0000 Subject: net: enable LCO for udp_tunnel_handle_offloads() users The only protocol affected at present is Geneve. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index cca2ad3082c3..734c15662ea9 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -103,7 +103,8 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - return iptunnel_handle_offloads(skb, udp_csum, type); + /* As we're a UDP tunnel, we support LCO, so don't need csum_help */ + return iptunnel_handle_offloads(skb, false, type); } static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3 From 6fa79666e24d32be1b709f5269af41ed9e829e7e Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 11 Feb 2016 21:02:31 +0000 Subject: net: ip_tunnel: remove 'csum_help' argument to iptunnel_handle_offloads All users now pass false, so we can remove it, and remove the code that was conditional upon it. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 +-- include/net/udp_tunnel.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6db96ea0144f..bc439f32baa9 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -279,8 +279,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); -struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, - int gso_type_mask); +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 734c15662ea9..97f5adb121a6 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -103,8 +103,7 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - /* As we're a UDP tunnel, we support LCO, so don't need csum_help */ - return iptunnel_handle_offloads(skb, false, type); + return iptunnel_handle_offloads(skb, type); } static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3 From e8ae7b000e64cf76283c72cae5e3ecd246618ef4 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 11 Feb 2016 21:03:37 +0000 Subject: Documentation/networking: add checksum-offloads.txt to explain LCO Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cf906d1ce8a7..39206751463e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3705,6 +3705,8 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. + * See Documentation/networking/checksum-offloads.txt for + * explanation of how this works. * Fill in outer checksum adjustment (e.g. with sum of outer * pseudo-header) before calling. * Also ensure that inner checksum is in linear data area. -- cgit v1.2.3 From d4ab4286276fcd6c155bafdf4422b712068d2516 Mon Sep 17 00:00:00 2001 From: "Keller, Jacob E" Date: Mon, 8 Feb 2016 16:05:03 -0800 Subject: ethtool: correctly ensure {GS}CHANNELS doesn't conflict with GS{RXFH} Ethernet drivers implementing both {GS}RXFH and {GS}CHANNELS ethtool ops incorrectly allow SCHANNELS when it would conflict with the settings from SRXFH. This occurs because it is not possible for drivers to understand whether their Rx flow indirection table has been configured or is in the default state. In addition, drivers currently behave in various ways when increasing the number of Rx channels. Some drivers will always destroy the Rx flow indirection table when this occurs, whether it has been set by the user or not. Other drivers will attempt to preserve the table even if the user has never modified it from the default driver settings. Neither of these situation is desirable because it leads to unexpected behavior or loss of user configuration. The correct behavior is to simply return -EINVAL when SCHANNELS would conflict with the current Rx flow table settings. However, it should only do so if the current settings were modified by the user. If we required that the new settings never conflict with the current (default) Rx flow settings, we would force users to first reduce their Rx flow settings and then reduce the number of Rx channels. This patch proposes a solution implemented in net/core/ethtool.c which ensures that all drivers behave correctly. It checks whether the RXFH table has been configured to non-default settings, and stores this information in a private netdev flag. When the number of channels is requested to change, it first ensures that the current Rx flow table is not going to assign flows to now disabled channels. Signed-off-by: Jacob Keller Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 219f53c30cb3..0499569c256d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1291,6 +1291,7 @@ struct net_device_ops { * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device + * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1318,6 +1319,7 @@ enum netdev_priv_flags { IFF_OPENVSWITCH = 1<<22, IFF_L3MDEV_SLAVE = 1<<23, IFF_TEAM = 1<<24, + IFF_RXFH_CONFIGURED = 1<<25, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1345,6 +1347,7 @@ enum netdev_priv_flags { #define IFF_OPENVSWITCH IFF_OPENVSWITCH #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE #define IFF_TEAM IFF_TEAM +#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED /** * struct net_device - The DEVICE structure. @@ -4048,6 +4051,11 @@ static inline bool netif_is_lag_port(const struct net_device *dev) return netif_is_bond_slave(dev) || netif_is_team_port(dev); } +static inline bool netif_is_rxfh_configured(const struct net_device *dev) +{ + return dev->priv_flags & IFF_RXFH_CONFIGURED; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { -- cgit v1.2.3 From 911362c70df5b766c243dc297fadeaced786ffd8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:53 +0100 Subject: net: add dst_cache support This patch add a generic, lockless dst cache implementation. The need for lock is avoided updating the dst cache fields only in per cpu scope, and requiring that the cache manipulation functions are invoked with the local bh disabled. The refresh_ts and reset_ts fields are used to ensure the cache consistency in case of cuncurrent cache update (dst_cache_set*) and reset operation (dst_cache_reset). Consider the following scenario: CPU1: CPU2: dst_cache_reset() dst_cache_set() The dst entry set passed to dst_cache_set() should not be used for later dst cache lookup, because it's obtained using old configuration values. Since the refresh_ts is updated only on dst_cache lookup, the cached value in the above scenario will be discarded on the next lookup. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/dst_cache.h | 97 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 include/net/dst_cache.h (limited to 'include') diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h new file mode 100644 index 000000000000..151accae708b --- /dev/null +++ b/include/net/dst_cache.h @@ -0,0 +1,97 @@ +#ifndef _NET_DST_CACHE_H +#define _NET_DST_CACHE_H + +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +struct dst_cache { + struct dst_cache_pcpu __percpu *cache; + unsigned long reset_ts; +}; + +/** + * dst_cache_get - perform cache lookup + * @dst_cache: the cache + * + * The caller should use dst_cache_get_ip4() if it need to retrieve the + * source address to be used when xmitting to the cached dst. + * local BH must be disabled. + */ +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache); + +/** + * dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * local BH must be disabled. + */ +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr); + +/** + * dst_cache_set_ip4 - store the ipv4 dst into the cache + * @dst_cache: the cache + * @dst: the entry to be cached + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled. + */ +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr); + +#if IS_ENABLED(CONFIG_IPV6) + +/** + * dst_cache_set_ip6 - store the ipv6 dst into the cache + * @dst_cache: the cache + * @dst: the entry to be cached + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled. + */ +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr); + +/** + * dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * local BH must be disabled. + */ +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr); +#endif + +/** + * dst_cache_reset - invalidate the cache contents + * @dst_cache: the cache + * + * This do not free the cached dst to avoid races and contentions. + * the dst will be freed on later cache lookup. + */ +static inline void dst_cache_reset(struct dst_cache *dst_cache) +{ + dst_cache->reset_ts = jiffies; +} + +/** + * dst_cache_init - initialize the cache, allocating the required storage + * @dst_cache: the cache + * @gfp: allocation flags + */ +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp); + +/** + * dst_cache_destroy - empty the cache and free the allocated storage + * @dst_cache: the cache + * + * No synchronization is enforced: it must be called only when the cache + * is unsed. + */ +void dst_cache_destroy(struct dst_cache *dst_cache); + +#endif -- cgit v1.2.3 From 607f725f6f7d5ec3759fbc16224afb60e2152a5b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:54 +0100 Subject: net: replace dst_cache ip6_tunnel implementation with the generic one This also fix a potential race into the existing tunnel code, which could lead to the wrong dst to be permanenty cached: CPU1: CPU2: dst = ip6_route_output(...) dst_cache_reset() // no effect, // the cache is empty dst_cache_set() // the wrong dst // is permanenty stored // into the cache With the new dst implementation the above race is not possible since the first cache lookup after dst_cache_reset will fail due to the timestamp check Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 0d0ce0b2d870..499a707765ea 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -6,6 +6,7 @@ #include #include #include +#include #define IP6TUNNEL_ERR_TIMEO (30*HZ) @@ -33,12 +34,6 @@ struct __ip6_tnl_parm { __be32 o_key; }; -struct ip6_tnl_dst { - seqlock_t lock; - struct dst_entry __rcu *dst; - u32 cookie; -}; - /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -46,7 +41,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ + struct dst_cache dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -66,11 +61,6 @@ struct ipv6_tlv_tnl_enc_lim { __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); -int ip6_tnl_dst_init(struct ip6_tnl *t); -void ip6_tnl_dst_destroy(struct ip6_tnl *t); -void ip6_tnl_dst_reset(struct ip6_tnl *t); -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, -- cgit v1.2.3 From e09acddf873bf775b208b452a4c3a3fd26fa9427 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:55 +0100 Subject: ip_tunnel: replace dst_cache with generic implementation The current ip_tunnel cache implementation is prone to a race that will cause the wrong dst to be cached on cuncurrent dst cache miss and ip tunnel update via netlink. Replacing with the generic implementation fix the issue. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index bc439f32baa9..fd36936d85a6 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -13,6 +13,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -85,11 +86,6 @@ struct ip_tunnel_prl_entry { struct rcu_head rcu_head; }; -struct ip_tunnel_dst { - struct dst_entry __rcu *dst; - __be32 saddr; -}; - struct metadata_dst; struct ip_tunnel { @@ -108,7 +104,7 @@ struct ip_tunnel { int tun_hlen; /* Precalculated header length */ int mlink; - struct ip_tunnel_dst __percpu *dst_cache; + struct dst_cache dst_cache; struct ip_tunnel_parm parms; @@ -247,7 +243,6 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); void ip_tunnel_setup(struct net_device *dev, int net_id); -void ip_tunnel_dst_reset_all(struct ip_tunnel *t); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); -- cgit v1.2.3 From 0c1d70af924b966cc71e9e48920b2b635441aa50 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:56 +0100 Subject: net: use dst_cache for vxlan device In case of UDP traffic with datagram length below MTU this give about 3% performance increase when tunneling over ipv4 and about 70% when tunneling over ipv6. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/vxlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 25bd919c9ef0..b314e4af89c5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -148,6 +148,7 @@ struct vxlan_rdst { u32 remote_ifindex; struct list_head list; struct rcu_head rcu; + struct dst_cache dst_cache; }; struct vxlan_config { -- cgit v1.2.3 From d71785ffc7e7cae3fbdc4ea8a9d05b7a1c59f7b8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:57 +0100 Subject: net: add dst_cache to ovs vxlan lwtunnel In case of UDP traffic with datagram length below MTU this give about 2% performance increase when tunneling over ipv4 and about 60% when tunneling over ipv6 Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + include/net/ip_tunnels.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 30a56ab2ccfb..84b833af6882 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -62,6 +62,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, sizeof(a->u.tun_info) + a->u.tun_info.options_len); } +void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index fd36936d85a6..87408ab80856 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -58,6 +58,9 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; +#ifdef CONFIG_DST_CACHE + struct dst_cache dst_cache; +#endif u8 options_len; u8 mode; }; -- cgit v1.2.3 From cd9b266095f422267bddbec88f9098b48ea548fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Feb 2016 22:02:53 -0800 Subject: tcp: add tcpi_min_rtt and tcpi_notsent_bytes to tcp_info tcpi_min_rtt reports the minimal rtt observed by TCP stack for the flow, in usec unit. Might be ~0U if not yet known. tcpi_notsent_bytes reports the amount of bytes in the write queue that were not yet sent. This is done in a single patch to not add a temporary 32bit padding hole in tcp_info. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 65a77b071e22..fe95446e9abf 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -196,6 +196,9 @@ struct tcp_info { __u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */ __u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */ __u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */ + + __u32 tcpi_notsent_bytes; + __u32 tcpi_min_rtt; }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From fa50d974d104113630d68b7d03233a6686230d0c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:27 +0200 Subject: ipv4: Namespaceify ip_default_ttl sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/route.h | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 848fe8056534..bc8f7f94abcb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -80,6 +80,7 @@ struct netns_ipv4 { int sysctl_tcp_ecn; int sysctl_tcp_ecn_fallback; + int sysctl_ip_default_ttl; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; int sysctl_ip_nonlocal_bind; diff --git a/include/net/route.h b/include/net/route.h index a3b9ef74a389..9b0a523bb428 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -329,14 +329,13 @@ static inline int inet_iif(const struct sk_buff *skb) return skb->skb_iif; } -extern int sysctl_ip_default_ttl; - static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + struct net *net = dev_net(dst->dev); if (hoplimit == 0) - hoplimit = sysctl_ip_default_ttl; + hoplimit = net->ipv4.sysctl_ip_default_ttl; return hoplimit; } -- cgit v1.2.3 From 287b7f38fd6842e534db1783cead3843f7677b79 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:29 +0200 Subject: ipv4: Namespacify ip_dynaddr sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/ip.h | 3 --- include/net/netns/ipv4.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 1a98f1ca1638..e3fb25d76421 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -248,9 +248,6 @@ extern int inet_peer_maxttl; /* From ip_input.c */ extern int sysctl_ip_early_demux; -/* From ip_output.c */ -extern int sysctl_ip_dynaddr; - void ipfrag_init(void); void ip_static_sysctl_init(void); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index bc8f7f94abcb..b7e3fb2587da 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -84,6 +84,8 @@ struct netns_ipv4 { int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; int sysctl_ip_nonlocal_bind; + /* Shall we try to damage output packets if routing dev changes? */ + int sysctl_ip_dynaddr; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; -- cgit v1.2.3 From e21145a9871aa5ae07e01926105bb8e523d64095 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:30 +0200 Subject: ipv4: namespacify ip_early_demux sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/ip.h | 3 --- include/net/netns/ipv4.h | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index e3fb25d76421..cbb134b2f0e4 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -245,9 +245,6 @@ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; -/* From ip_input.c */ -extern int sysctl_ip_early_demux; - void ipfrag_init(void); void ip_static_sysctl_init(void); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b7e3fb2587da..a69cde3ce460 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -86,6 +86,7 @@ struct netns_ipv4 { int sysctl_ip_nonlocal_bind; /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; + int sysctl_ip_early_demux; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; -- cgit v1.2.3 From 0fbf4cb27e061204c8cee8e7eb2870416bdf30fd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:31 +0200 Subject: ipv4: namespacify ip fragment max dist sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/inet_frag.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 12aac0fd6ee7..909972aa3acd 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -13,6 +13,7 @@ struct netns_frags { int timeout; int high_thresh; int low_thresh; + int max_dist; }; /** -- cgit v1.2.3 From e4c6734eaab90695db0ea8456307790cb0c1ccb5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:16:15 -0800 Subject: net: rework ndo tc op to consume additional qdisc handle parameter The ndo_setup_tc() op was added to support drivers offloading tx qdiscs however only support for mqprio was ever added. So we only ever added support for passing the number of traffic classes to the driver. This patch generalizes the ndo_setup_tc op so that a handle can be provided to indicate if the offload is for ingress or egress or potentially even child qdiscs. CC: Murali Karicheri CC: Shradha Shah CC: Or Gerlitz CC: Ariel Elior CC: Jeff Kirsher CC: Bruce Allan CC: Jesse Brandeburg CC: Don Skidmore Signed-off-by: John Fastabend Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0499569c256d..48928b6f9cb6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -51,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -1150,7 +1151,7 @@ struct net_device_ops { int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); - int (*ndo_setup_tc)(struct net_device *dev, u8 tc); + int (*ndo_setup_tc)(struct net_device *dev, u32 handle, u8 tc); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); -- cgit v1.2.3 From 16e5cc647173a97e33b3e3ba81f73eb455561794 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:16:43 -0800 Subject: net: rework setup_tc ndo op to consume general tc operand This patch updates setup_tc so we can pass additional parameters into the ndo op in a generic way. To do this we provide structured union and type flag. This lets each classifier and qdisc provide its own set of attributes without having to add new ndo ops or grow the signature of the callback. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 48928b6f9cb6..e396060f815f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -779,6 +779,21 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); +/* This structure holds attributes of qdisc and classifiers + * that are being passed to the netdevice through the setup_tc op. + */ +enum { + TC_SETUP_MQPRIO, +}; + +struct tc_to_netdev { + unsigned int type; + union { + u8 tc; + }; +}; + + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1151,7 +1166,10 @@ struct net_device_ops { int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); - int (*ndo_setup_tc)(struct net_device *dev, u32 handle, u8 tc); + int (*ndo_setup_tc)(struct net_device *dev, + u32 handle, + __be16 protocol, + struct tc_to_netdev *tc); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); -- cgit v1.2.3 From a1b7c5fd7fe98f51fbbc393ee1fc4c1cdb2f0119 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:17:09 -0800 Subject: net: sched: add cls_u32 offload hooks for netdevs This patch allows netdev drivers to consume cls_u32 offloads via the ndo_setup_tc ndo op. This works aligns with how network drivers have been doing qdisc offloads for mqprio. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++- include/net/pkt_cls.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e396060f815f..47671ce04ac4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -779,17 +779,21 @@ static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); -/* This structure holds attributes of qdisc and classifiers +/* These structures hold the attributes of qdisc and classifiers * that are being passed to the netdevice through the setup_tc op. */ enum { TC_SETUP_MQPRIO, + TC_SETUP_CLSU32, }; +struct tc_cls_u32_offload; + struct tc_to_netdev { unsigned int type; union { u8 tc; + struct tc_cls_u32_offload *cls_u32; }; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bc49967e1a68..59789ca6e2c8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -358,4 +358,38 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +struct tc_cls_u32_knode { + struct tcf_exts *exts; + u8 fshift; + u32 handle; + u32 val; + u32 mask; + u32 link_handle; + struct tc_u32_sel *sel; +}; + +struct tc_cls_u32_hnode { + u32 handle; + u32 prio; + unsigned int divisor; +}; + +enum tc_clsu32_command { + TC_CLSU32_NEW_KNODE, + TC_CLSU32_REPLACE_KNODE, + TC_CLSU32_DELETE_KNODE, + TC_CLSU32_NEW_HNODE, + TC_CLSU32_REPLACE_HNODE, + TC_CLSU32_DELETE_HNODE, +}; + +struct tc_cls_u32_offload { + /* knode values */ + enum tc_clsu32_command command; + union { + struct tc_cls_u32_knode knode; + struct tc_cls_u32_hnode hnode; + }; +}; + #endif -- cgit v1.2.3 From 1c78c64e9c6f43a490427d55cd2d213b7c6795c1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:17:37 -0800 Subject: net: add tc offload feature flag Its useful to turn off the qdisc offload feature at a per device level. This gives us a big hammer to enable/disable offloading. More fine grained control (i.e. per rule) may be supported later. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index d9654f0eecb3..a734bf43d190 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -67,6 +67,8 @@ enum { NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_BUSY_POLL_BIT, /* Busy poll */ + NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ + /* * Add your fresh new feature above and remember to update * netdev_features_strings[] in net/core/ethtool.c and maybe @@ -124,6 +126,7 @@ enum { #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) +#define NETIF_F_HW_TC __NETIF_F(HW_TC) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) -- cgit v1.2.3 From 3b01cf56daf96acf9b155d6201d94bc8b4de218e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:18:03 -0800 Subject: net: tc: helper functions to query action types This is a helper function drivers can use to learn if the action type is a drop action. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 592a6bc02b0b..04a31830711b 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -2,6 +2,7 @@ #define __NET_TC_GACT_H #include +#include struct tcf_gact { struct tcf_common common; @@ -15,4 +16,19 @@ struct tcf_gact { #define to_gact(a) \ container_of(a->priv, struct tcf_gact, common) +#ifdef CONFIG_NET_CLS_ACT +static inline bool is_tcf_gact_shot(const struct tc_action *a) +{ + struct tcf_gact *gact; + + if (a->ops && a->ops->type != TCA_ACT_GACT) + return false; + + gact = a->priv; + if (gact->tcf_action == TC_ACT_SHOT) + return true; + + return false; +} +#endif #endif /* __NET_TC_GACT_H */ -- cgit v1.2.3 From 1cd4d5c4326a7ed3bb0e346bd7d20f5057a80ae6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Feb 2016 14:28:05 +0800 Subject: sctp: remove the unused sctp_datamsg_free() Since commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg") used sctp_datamsg_put in sctp_sendmsg, instead of sctp_datamsg_free, this function has no use in sctp. So we will remove it. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 205630bb5010..d05b56641abc 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -535,7 +535,6 @@ struct sctp_datamsg { struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); -void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); -- cgit v1.2.3 From fc48b7a6148af974b49db145812a8b060324a503 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 15 Feb 2016 13:22:35 -0500 Subject: qed/qede: use 8.7.3.0 FW. This patch moves the qed* driver into utilizing the 8.7.3.0 FW. This new FW is required for a lot of new SW features, including: - Vlan filtering offload - Encapsulation offload support - HW ingress aggregations As well as paving the way for the possibility of adding storage protocols in the future. V2: - Fix kbuild test robot error/warnings. Signed-off-by: Yuval Mintz Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Manish Chopra Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 36 ++++++++- include/linux/qed/eth_common.h | 171 ++++++++++++++++++++++++++++++++--------- include/linux/qed/qed_if.h | 8 +- 3 files changed, 173 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 1d1ba2c5ee7a..53ecb37ae563 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -11,9 +11,11 @@ #define CORE_SPQE_PAGE_SIZE_BYTES 4096 +#define X_FINAL_CLEANUP_AGG_INT 1 + #define FW_MAJOR_VERSION 8 -#define FW_MINOR_VERSION 4 -#define FW_REVISION_VERSION 2 +#define FW_MINOR_VERSION 7 +#define FW_REVISION_VERSION 3 #define FW_ENGINEERING_VERSION 0 /***********************/ @@ -152,6 +154,9 @@ /* number of queues in a PF queue group */ #define QM_PF_QUEUE_GROUP_SIZE 8 +/* the size of a single queue element in bytes */ +#define QM_PQ_ELEMENT_SIZE 4 + /* base number of Tx PQs in the CM PQ representation. * should be used when storing PQ IDs in CM PQ registers and context */ @@ -285,6 +290,16 @@ #define PXP_NUM_ILT_RECORDS_K2 11000 #define MAX_NUM_ILT_RECORDS MAX(PXP_NUM_ILT_RECORDS_BB, PXP_NUM_ILT_RECORDS_K2) +#define SDM_COMP_TYPE_NONE 0 +#define SDM_COMP_TYPE_WAKE_THREAD 1 +#define SDM_COMP_TYPE_AGG_INT 2 +#define SDM_COMP_TYPE_CM 3 +#define SDM_COMP_TYPE_LOADER 4 +#define SDM_COMP_TYPE_PXP 5 +#define SDM_COMP_TYPE_INDICATE_ERROR 6 +#define SDM_COMP_TYPE_RELEASE_THREAD 7 +#define SDM_COMP_TYPE_RAM 8 + /******************/ /* PBF CONSTANTS */ /******************/ @@ -335,7 +350,7 @@ struct event_ring_entry { /* Multi function mode */ enum mf_mode { - SF, + ERROR_MODE /* Unsupported mode */, MF_OVLAN, MF_NPAR, MAX_MF_MODE @@ -606,4 +621,19 @@ struct status_block { #define STATUS_BLOCK_ZERO_PAD3_SHIFT 24 }; +struct tunnel_parsing_flags { + u8 flags; +#define TUNNEL_PARSING_FLAGS_TYPE_MASK 0x3 +#define TUNNEL_PARSING_FLAGS_TYPE_SHIFT 0 +#define TUNNEL_PARSING_FLAGS_TENNANT_ID_EXIST_MASK 0x1 +#define TUNNEL_PARSING_FLAGS_TENNANT_ID_EXIST_SHIFT 2 +#define TUNNEL_PARSING_FLAGS_NEXT_PROTOCOL_MASK 0x3 +#define TUNNEL_PARSING_FLAGS_NEXT_PROTOCOL_SHIFT 3 +#define TUNNEL_PARSING_FLAGS_FIRSTHDRIPMATCH_MASK 0x1 +#define TUNNEL_PARSING_FLAGS_FIRSTHDRIPMATCH_SHIFT 5 +#define TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_MASK 0x1 +#define TUNNEL_PARSING_FLAGS_IPV4_FRAGMENT_SHIFT 6 +#define TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_MASK 0x1 +#define TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_SHIFT 7 +}; #endif /* __COMMON_HSI__ */ diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index 320b3373ac1d..092cb0c1afcb 100644 --- a/include/linux/qed/eth_common.h +++ b/include/linux/qed/eth_common.h @@ -17,10 +17,8 @@ #define ETH_MAX_RAMROD_PER_CON 8 #define ETH_TX_BD_PAGE_SIZE_BYTES 4096 #define ETH_RX_BD_PAGE_SIZE_BYTES 4096 -#define ETH_RX_SGE_PAGE_SIZE_BYTES 4096 #define ETH_RX_CQE_PAGE_SIZE_BYTES 4096 #define ETH_RX_NUM_NEXT_PAGE_BDS 2 -#define ETH_RX_NUM_NEXT_PAGE_SGES 2 #define ETH_TX_MIN_BDS_PER_NON_LSO_PKT 1 #define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET 18 @@ -34,7 +32,8 @@ #define ETH_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS -#define ETH_REG_CQE_PBL_SIZE 3 +/* Maximum number of buffers, used for RX packet placement */ +#define ETH_RX_MAX_BUFF_PER_PKT 5 /* num of MAC/VLAN filters */ #define ETH_NUM_MAC_FILTERS 512 @@ -54,9 +53,9 @@ /* TPA constants */ #define ETH_TPA_MAX_AGGS_NUM 64 -#define ETH_TPA_CQE_START_SGL_SIZE 3 -#define ETH_TPA_CQE_CONT_SGL_SIZE 6 -#define ETH_TPA_CQE_END_SGL_SIZE 4 +#define ETH_TPA_CQE_START_LEN_LIST_SIZE ETH_RX_MAX_BUFF_PER_PKT +#define ETH_TPA_CQE_CONT_LEN_LIST_SIZE 6 +#define ETH_TPA_CQE_END_LEN_LIST_SIZE 4 /* Queue Zone sizes */ #define TSTORM_QZONE_SIZE 0 @@ -74,18 +73,18 @@ struct coalescing_timeset { struct eth_tx_1st_bd_flags { u8 bitfields; +#define ETH_TX_1ST_BD_FLAGS_START_BD_MASK 0x1 +#define ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT 0 #define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_MASK 0x1 -#define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 0 +#define ETH_TX_1ST_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 1 #define ETH_TX_1ST_BD_FLAGS_IP_CSUM_MASK 0x1 -#define ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT 1 +#define ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT 2 #define ETH_TX_1ST_BD_FLAGS_L4_CSUM_MASK 0x1 -#define ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT 2 +#define ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT 3 #define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_MASK 0x1 -#define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT 3 +#define ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT 4 #define ETH_TX_1ST_BD_FLAGS_LSO_MASK 0x1 -#define ETH_TX_1ST_BD_FLAGS_LSO_SHIFT 4 -#define ETH_TX_1ST_BD_FLAGS_START_BD_MASK 0x1 -#define ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT 5 +#define ETH_TX_1ST_BD_FLAGS_LSO_SHIFT 5 #define ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_MASK 0x1 #define ETH_TX_1ST_BD_FLAGS_TUNN_IP_CSUM_SHIFT 6 #define ETH_TX_1ST_BD_FLAGS_TUNN_L4_CSUM_MASK 0x1 @@ -97,38 +96,44 @@ struct eth_tx_data_1st_bd { __le16 vlan; u8 nbds; struct eth_tx_1st_bd_flags bd_flags; - __le16 fw_use_only; + __le16 bitfields; +#define ETH_TX_DATA_1ST_BD_TUNN_CFG_OVERRIDE_MASK 0x1 +#define ETH_TX_DATA_1ST_BD_TUNN_CFG_OVERRIDE_SHIFT 0 +#define ETH_TX_DATA_1ST_BD_RESERVED0_MASK 0x1 +#define ETH_TX_DATA_1ST_BD_RESERVED0_SHIFT 1 +#define ETH_TX_DATA_1ST_BD_FW_USE_ONLY_MASK 0x3FFF +#define ETH_TX_DATA_1ST_BD_FW_USE_ONLY_SHIFT 2 }; /* The parsing information data for the second tx bd of a given packet. */ struct eth_tx_data_2nd_bd { __le16 tunn_ip_size; - __le16 bitfields; -#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK 0x1FFF -#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT 0 -#define ETH_TX_DATA_2ND_BD_RESERVED0_MASK 0x7 -#define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT 13 - __le16 bitfields2; + __le16 bitfields1; #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_MASK 0xF #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_SHIFT 0 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_ETH_TYPE_MASK 0x3 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_ETH_TYPE_SHIFT 4 #define ETH_TX_DATA_2ND_BD_DEST_PORT_MODE_MASK 0x3 #define ETH_TX_DATA_2ND_BD_DEST_PORT_MODE_SHIFT 6 +#define ETH_TX_DATA_2ND_BD_START_BD_MASK 0x1 +#define ETH_TX_DATA_2ND_BD_START_BD_SHIFT 8 #define ETH_TX_DATA_2ND_BD_TUNN_TYPE_MASK 0x3 -#define ETH_TX_DATA_2ND_BD_TUNN_TYPE_SHIFT 8 +#define ETH_TX_DATA_2ND_BD_TUNN_TYPE_SHIFT 9 #define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_MASK 0x1 -#define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_SHIFT 10 +#define ETH_TX_DATA_2ND_BD_TUNN_INNER_IPV6_SHIFT 11 #define ETH_TX_DATA_2ND_BD_IPV6_EXT_MASK 0x1 -#define ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT 11 +#define ETH_TX_DATA_2ND_BD_IPV6_EXT_SHIFT 12 #define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_MASK 0x1 -#define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_SHIFT 12 +#define ETH_TX_DATA_2ND_BD_TUNN_IPV6_EXT_SHIFT 13 #define ETH_TX_DATA_2ND_BD_L4_UDP_MASK 0x1 -#define ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT 13 +#define ETH_TX_DATA_2ND_BD_L4_UDP_SHIFT 14 #define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_MASK 0x1 -#define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT 14 -#define ETH_TX_DATA_2ND_BD_RESERVED1_MASK 0x1 -#define ETH_TX_DATA_2ND_BD_RESERVED1_SHIFT 15 +#define ETH_TX_DATA_2ND_BD_L4_PSEUDO_CSUM_MODE_SHIFT 15 + __le16 bitfields2; +#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_MASK 0x1FFF +#define ETH_TX_DATA_2ND_BD_L4_HDR_START_OFFSET_W_SHIFT 0 +#define ETH_TX_DATA_2ND_BD_RESERVED0_MASK 0x7 +#define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT 13 }; /* Regular ETH Rx FP CQE. */ @@ -145,11 +150,68 @@ struct eth_fast_path_rx_reg_cqe { struct parsing_and_err_flags pars_flags; __le16 vlan_tag; __le32 rss_hash; - __le16 len_on_bd; + __le16 len_on_first_bd; u8 placement_offset; - u8 reserved; - __le16 pbl[ETH_REG_CQE_PBL_SIZE]; - u8 reserved1[10]; + struct tunnel_parsing_flags tunnel_pars_flags; + u8 bd_num; + u8 reserved[7]; + u32 fw_debug; + u8 reserved1[3]; + u8 flags; +#define ETH_FAST_PATH_RX_REG_CQE_VALID_MASK 0x1 +#define ETH_FAST_PATH_RX_REG_CQE_VALID_SHIFT 0 +#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_MASK 0x1 +#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_SHIFT 1 +#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_MASK 0x3F +#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_SHIFT 2 +}; + +/* TPA-continue ETH Rx FP CQE. */ +struct eth_fast_path_rx_tpa_cont_cqe { + u8 type; + u8 tpa_agg_index; + __le16 len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved[5]; + u8 reserved1; + __le16 reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; +}; + +/* TPA-end ETH Rx FP CQE. */ +struct eth_fast_path_rx_tpa_end_cqe { + u8 type; + u8 tpa_agg_index; + __le16 total_packet_len; + u8 num_of_bds; + u8 end_reason; + __le16 num_of_coalesced_segs; + __le32 ts_delta; + __le16 len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + u8 reserved1[3]; + u8 reserved2; + __le16 reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE]; +}; + +/* TPA-start ETH Rx FP CQE. */ +struct eth_fast_path_rx_tpa_start_cqe { + u8 type; + u8 bitfields; +#define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_MASK 0x7 +#define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_SHIFT 0 +#define ETH_FAST_PATH_RX_TPA_START_CQE_TC_MASK 0xF +#define ETH_FAST_PATH_RX_TPA_START_CQE_TC_SHIFT 3 +#define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_MASK 0x1 +#define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_SHIFT 7 + __le16 seg_len; + struct parsing_and_err_flags pars_flags; + __le16 vlan_tag; + __le32 rss_hash; + __le16 len_on_first_bd; + u8 placement_offset; + struct tunnel_parsing_flags tunnel_pars_flags; + u8 tpa_agg_index; + u8 header_len; + __le16 ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE]; + u32 fw_debug; }; /* The L4 pseudo checksum mode for Ethernet */ @@ -168,13 +230,26 @@ struct eth_slow_path_rx_cqe { u8 type; u8 ramrod_cmd_id; u8 error_flag; - u8 reserved[27]; + u8 reserved[25]; __le16 echo; + u8 reserved1; + u8 flags; +/* for PMD mode - valid indication */ +#define ETH_SLOW_PATH_RX_CQE_VALID_MASK 0x1 +#define ETH_SLOW_PATH_RX_CQE_VALID_SHIFT 0 +/* for PMD mode - valid toggle indication */ +#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_MASK 0x1 +#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_SHIFT 1 +#define ETH_SLOW_PATH_RX_CQE_RESERVED2_MASK 0x3F +#define ETH_SLOW_PATH_RX_CQE_RESERVED2_SHIFT 2 }; /* union for all ETH Rx CQE types */ union eth_rx_cqe { struct eth_fast_path_rx_reg_cqe fast_path_regular; + struct eth_fast_path_rx_tpa_start_cqe fast_path_tpa_start; + struct eth_fast_path_rx_tpa_cont_cqe fast_path_tpa_cont; + struct eth_fast_path_rx_tpa_end_cqe fast_path_tpa_end; struct eth_slow_path_rx_cqe slow_path; }; @@ -183,15 +258,18 @@ enum eth_rx_cqe_type { ETH_RX_CQE_TYPE_UNUSED, ETH_RX_CQE_TYPE_REGULAR, ETH_RX_CQE_TYPE_SLOW_PATH, + ETH_RX_CQE_TYPE_TPA_START, + ETH_RX_CQE_TYPE_TPA_CONT, + ETH_RX_CQE_TYPE_TPA_END, MAX_ETH_RX_CQE_TYPE }; /* ETH Rx producers data */ struct eth_rx_prod_data { __le16 bd_prod; - __le16 sge_prod; __le16 cqe_prod; __le16 reserved; + __le16 reserved1; }; /* The first tx bd of a given packet */ @@ -211,12 +289,17 @@ struct eth_tx_2nd_bd { /* The parsing information data for the third tx bd of a given packet. */ struct eth_tx_data_3rd_bd { __le16 lso_mss; - u8 bitfields; + __le16 bitfields; #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK 0xF #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT 0 #define ETH_TX_DATA_3RD_BD_HDR_NBD_MASK 0xF #define ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT 4 - u8 resereved0[3]; +#define ETH_TX_DATA_3RD_BD_START_BD_MASK 0x1 +#define ETH_TX_DATA_3RD_BD_START_BD_SHIFT 8 +#define ETH_TX_DATA_3RD_BD_RESERVED0_MASK 0x7F +#define ETH_TX_DATA_3RD_BD_RESERVED0_SHIFT 9 + u8 tunn_l4_hdr_start_offset_w; + u8 tunn_hdr_size_w; }; /* The third tx bd of a given packet */ @@ -226,12 +309,24 @@ struct eth_tx_3rd_bd { struct eth_tx_data_3rd_bd data; }; +/* Complementary information for the regular tx bd of a given packet. */ +struct eth_tx_data_bd { + __le16 reserved0; + __le16 bitfields; +#define ETH_TX_DATA_BD_RESERVED1_MASK 0xFF +#define ETH_TX_DATA_BD_RESERVED1_SHIFT 0 +#define ETH_TX_DATA_BD_START_BD_MASK 0x1 +#define ETH_TX_DATA_BD_START_BD_SHIFT 8 +#define ETH_TX_DATA_BD_RESERVED2_MASK 0x7F +#define ETH_TX_DATA_BD_RESERVED2_SHIFT 9 + __le16 reserved3; +}; + /* The common non-special TX BD ring element */ struct eth_tx_bd { struct regpair addr; __le16 nbytes; - __le16 reserved0; - __le32 reserved1; + struct eth_tx_data_bd data; }; union eth_tx_bd_types { diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d4a32e878180..3d43c1d4ecef 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -80,7 +80,7 @@ struct qed_dev_info { u8 num_hwfns; u8 hw_mac[ETH_ALEN]; - bool is_mf; + bool is_mf_default; /* FW version */ u16 fw_major; @@ -360,6 +360,12 @@ enum DP_MODULE { /* to be added...up to 0x8000000 */ }; +enum qed_mf_mode { + QED_MF_DEFAULT, + QED_MF_OVLAN, + QED_MF_NPAR, +}; + struct qed_eth_stats { u64 no_buff_discards; u64 packet_too_big_discard; -- cgit v1.2.3 From 7bbf3cae65b6e438bf52033b63fdce4a86e89e17 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 15 Feb 2016 21:25:57 +0000 Subject: ipv4: Remove inet_lro library There are no longer any in-tree drivers that use it. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/inet_lro.h | 142 ----------------------------------------------- 1 file changed, 142 deletions(-) delete mode 100644 include/linux/inet_lro.h (limited to 'include') diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h deleted file mode 100644 index 9a715cfa1fe3..000000000000 --- a/include/linux/inet_lro.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * linux/include/linux/inet_lro.h - * - * Large Receive Offload (ipv4 / tcp) - * - * (C) Copyright IBM Corp. 2007 - * - * Authors: - * Jan-Bernd Themann - * Christoph Raisch - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#ifndef __INET_LRO_H_ -#define __INET_LRO_H_ - -#include -#include - -/* - * LRO statistics - */ - -struct net_lro_stats { - unsigned long aggregated; - unsigned long flushed; - unsigned long no_desc; -}; - -/* - * LRO descriptor for a tcp session - */ -struct net_lro_desc { - struct sk_buff *parent; - struct sk_buff *last_skb; - struct skb_frag_struct *next_frag; - struct iphdr *iph; - struct tcphdr *tcph; - __wsum data_csum; - __be32 tcp_rcv_tsecr; - __be32 tcp_rcv_tsval; - __be32 tcp_ack; - u32 tcp_next_seq; - u32 skb_tot_frags_len; - u16 ip_tot_len; - u16 tcp_saw_tstamp; /* timestamps enabled */ - __be16 tcp_window; - int pkt_aggr_cnt; /* counts aggregated packets */ - int vlan_packet; - int mss; - int active; -}; - -/* - * Large Receive Offload (LRO) Manager - * - * Fields must be set by driver - */ - -struct net_lro_mgr { - struct net_device *dev; - struct net_lro_stats stats; - - /* LRO features */ - unsigned long features; -#define LRO_F_NAPI 1 /* Pass packets to stack via NAPI */ -#define LRO_F_EXTRACT_VLAN_ID 2 /* Set flag if VLAN IDs are extracted - from received packets and eth protocol - is still ETH_P_8021Q */ - - /* - * Set for generated SKBs that are not added to - * the frag list in fragmented mode - */ - u32 ip_summed; - u32 ip_summed_aggr; /* Set in aggregated SKBs: CHECKSUM_UNNECESSARY - * or CHECKSUM_NONE */ - - int max_desc; /* Max number of LRO descriptors */ - int max_aggr; /* Max number of LRO packets to be aggregated */ - - int frag_align_pad; /* Padding required to properly align layer 3 - * headers in generated skb when using frags */ - - struct net_lro_desc *lro_arr; /* Array of LRO descriptors */ - - /* - * Optimized driver functions - * - * get_skb_header: returns tcp and ip header for packet in SKB - */ - int (*get_skb_header)(struct sk_buff *skb, void **ip_hdr, - void **tcpudp_hdr, u64 *hdr_flags, void *priv); - - /* hdr_flags: */ -#define LRO_IPV4 1 /* ip_hdr is IPv4 header */ -#define LRO_TCP 2 /* tcpudp_hdr is TCP header */ - - /* - * get_frag_header: returns mac, tcp and ip header for packet in SKB - * - * @hdr_flags: Indicate what kind of LRO has to be done - * (IPv4/IPv6/TCP/UDP) - */ - int (*get_frag_header)(struct skb_frag_struct *frag, void **mac_hdr, - void **ip_hdr, void **tcpudp_hdr, u64 *hdr_flags, - void *priv); -}; - -/* - * Processes a SKB - * - * @lro_mgr: LRO manager to use - * @skb: SKB to aggregate - * @priv: Private data that may be used by driver functions - * (for example get_tcp_ip_hdr) - */ - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - void *priv); -/* - * Forward all aggregated SKBs held by lro_mgr to network stack - */ - -void lro_flush_all(struct net_lro_mgr *lro_mgr); - -#endif -- cgit v1.2.3 From e014860e31e2a66b1a94088504360a6ebc023564 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 17 Feb 2016 14:59:30 -0800 Subject: net: pack tc_cls_u32_knode struct slighter better By packing the structure we can remove a few holes as Jamal suggests. before: struct tc_cls_u32_knode { struct tcf_exts * exts; /* 0 8 */ u8 fshift; /* 8 1 */ /* XXX 3 bytes hole, try to pack */ u32 handle; /* 12 4 */ u32 val; /* 16 4 */ u32 mask; /* 20 4 */ u32 link_handle; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct tc_u32_sel * sel; /* 32 8 */ /* size: 40, cachelines: 1, members: 7 */ /* sum members: 33, holes: 2, sum holes: 7 */ /* last cacheline: 40 bytes */ }; after: struct tc_cls_u32_knode { struct tcf_exts * exts; /* 0 8 */ struct tc_u32_sel * sel; /* 8 8 */ u32 handle; /* 16 4 */ u32 val; /* 20 4 */ u32 mask; /* 24 4 */ u32 link_handle; /* 28 4 */ u8 fshift; /* 32 1 */ /* size: 40, cachelines: 1, members: 7 */ /* padding: 7 */ /* last cacheline: 40 bytes */ }; Suggested-by: Jamal Hadi Salim Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 59789ca6e2c8..2121df574262 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -360,12 +360,12 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) struct tc_cls_u32_knode { struct tcf_exts *exts; - u8 fshift; + struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; - struct tc_u32_sel *sel; + u8 fshift; }; struct tc_cls_u32_hnode { -- cgit v1.2.3 From d4ac05ff3697e036dcb0e2e284c5f7eb77cc0966 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 16 Feb 2016 21:58:57 +0100 Subject: vxlan: introduce vxlan_hdr Currently, pointer to the vxlan header is kept in a local variable. It has to be reloaded whenever the pskb pull operations are performed which usually happens somewhere deep in called functions. Create a vxlan_hdr function and use it to reference the vxlan header instead. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index b314e4af89c5..3f38b40ec4aa 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,11 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, /* IPv6 header + UDP + VXLAN + Ethernet header */ #define VXLAN6_HEADROOM (40 + 8 + 8 + 14) +static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) +{ + return (struct vxlanhdr *)(udp_hdr(skb) + 1); +} + #if IS_ENABLED(CONFIG_VXLAN) void vxlan_get_rx_port(struct net_device *netdev); #else -- cgit v1.2.3 From 54bfd872bf16d40b61bd0cd9b769b2fef67dd272 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 16 Feb 2016 21:58:58 +0100 Subject: vxlan: keep flags and vni in network byte order Prevent repeated conversions from and to network order in the fast path. To achieve this, define all flag constants in big endian order and store VNI as __be32. To prevent confusion between the actual VNI value and the VNI field from the header (which contains additional reserved byte), strictly distinguish between "vni" and "vni_field". Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 70 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 3f38b40ec4aa..1b85a3b40c5a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -24,11 +24,11 @@ struct vxlanhdr { }; /* VXLAN header flags. */ -#define VXLAN_HF_VNI BIT(27) +#define VXLAN_HF_VNI cpu_to_be32(BIT(27)) #define VXLAN_N_VID (1u << 24) #define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) +#define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VNI_HASH_BITS 10 @@ -55,14 +55,14 @@ struct vxlanhdr { */ /* VXLAN-RCO header flags. */ -#define VXLAN_HF_RCO BIT(21) +#define VXLAN_HF_RCO cpu_to_be32(BIT(21)) /* Remote checksum offload header option */ -#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ -#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ -#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ +#define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ +#define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ +#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ #define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) -#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) +#define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) /* * VXLAN Group Based Policy Extension (VXLAN_F_GBP): @@ -105,9 +105,9 @@ struct vxlanhdr_gbp { }; /* VXLAN-GBP header flags. */ -#define VXLAN_HF_GBP BIT(31) +#define VXLAN_HF_GBP cpu_to_be32(BIT(31)) -#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF) +#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) /* skb->mark mapping * @@ -144,7 +144,7 @@ union vxlan_addr { struct vxlan_rdst { union vxlan_addr remote_ip; __be16 remote_port; - u32 remote_vni; + __be32 remote_vni; u32 remote_ifindex; struct list_head list; struct rcu_head rcu; @@ -154,7 +154,7 @@ struct vxlan_rdst { struct vxlan_config { union vxlan_addr remote_ip; union vxlan_addr saddr; - u32 vni; + __be32 vni; int remote_ifindex; int mtu; __be16 dst_port; @@ -267,6 +267,54 @@ static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) return (struct vxlanhdr *)(udp_hdr(skb) + 1); } +static inline __be32 vxlan_vni(__be32 vni_field) +{ +#if defined(__BIG_ENDIAN) + return vni_field >> 8; +#else + return (vni_field & VXLAN_VNI_MASK) << 8; +#endif +} + +static inline __be32 vxlan_vni_field(__be32 vni) +{ +#if defined(__BIG_ENDIAN) + return vni << 8; +#else + return vni >> 8; +#endif +} + +static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) +{ +#if defined(__BIG_ENDIAN) + return tun_id; +#else + return tun_id >> 32; +#endif +} + +static inline size_t vxlan_rco_start(__be32 vni_field) +{ + return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; +} + +static inline size_t vxlan_rco_offset(__be32 vni_field) +{ + return (vni_field & VXLAN_RCO_UDP) ? + offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check); +} + +static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) +{ + __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); + + if (offset == offsetof(struct udphdr, check)) + vni_field |= VXLAN_RCO_UDP; + return vni_field; +} + #if IS_ENABLED(CONFIG_VXLAN) void vxlan_get_rx_port(struct net_device *netdev); #else -- cgit v1.2.3 From d1b4c689d4130bcfd3532680b64db562300716b6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:24 +0100 Subject: netlink: remove mmapped netlink support mmapped netlink has a number of unresolved issues: - TX zerocopy support had to be disabled more than a year ago via commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.") because the content of the mmapped area can change after netlink attribute validation but before message processing. - RX support was implemented mainly to speed up nfqueue dumping packet payload to userspace. However, since commit ae08ce0021087a5d812d2 ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy with the socket-based interface too (via the skb_zerocopy helper). The other problem is that skbs attached to mmaped netlink socket behave different from normal skbs: - they don't have a shinfo area, so all functions that use skb_shinfo() (e.g. skb_clone) cannot be used. - reserving headroom prevents userspace from seeing the content as it expects message to start at skb->head. See for instance commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump"). - skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we crash because it needs the sk to check if a tx ring is attached. Also not obvious, leads to non-intuitive bug fixes such as 7c7bdf359 ("netfilter: nfnetlink: use original skbuff when acking batches"). mmaped netlink also didn't play nicely with the skb_zerocopy helper used by nfqueue and openvswitch. Daniel Borkmann fixed this via commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue zero-copy")' but at the cost of also needing to provide remaining length to the allocation function. nfqueue also has problems when used with mmaped rx netlink: - mmaped netlink doesn't allow use of nfqueue batch verdict messages. Problem is that in the mmap case, the allocation time also determines the ordering in which the frame will be seen by userspace (A allocating before B means that A is located in earlier ring slot, but this also means that B might get a lower sequence number then A since seqno is decided later. To fix this we would need to extend the spinlocked region to also cover the allocation and message setup which isn't desirable. - nfqueue can now be configured to queue large (GSO) skbs to userspace. Queing GSO packets is faster than having to force a software segmentation in the kernel, so this is a desirable option. However, with a mmap based ring one has to use 64kb per ring slot element, else mmap has to fall back to the socket path (NL_MMAP_STATUS_COPY) for all large packets. To use the mmap interface, userspace not only has to probe for mmap netlink support, it also has to implement a recv/socket receive path in order to handle messages that exceed the size of an rx ring element. Cc: Daniel Borkmann Cc: Ken-ichirou MATSUZAWA Cc: Pablo Neira Ayuso Cc: Patrick McHardy Cc: Thomas Graf Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 4 ++++ include/uapi/linux/netlink_diag.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f095155d8749..0dba4e4ed2be 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -107,8 +107,10 @@ struct nlmsgerr { #define NETLINK_PKTINFO 3 #define NETLINK_BROADCAST_ERROR 4 #define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ #define NETLINK_RX_RING 6 #define NETLINK_TX_RING 7 +#endif #define NETLINK_LISTEN_ALL_NSID 8 #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 @@ -134,6 +136,7 @@ struct nl_mmap_hdr { __u32 nm_gid; }; +#ifndef __KERNEL__ enum nl_mmap_status { NL_MMAP_STATUS_UNUSED, NL_MMAP_STATUS_RESERVED, @@ -145,6 +148,7 @@ enum nl_mmap_status { #define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO #define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) #define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif #define NET_MAJOR 36 /* Major 36 is reserved for networking */ diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index f2159d30d1f5..d79399394b46 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -48,6 +48,8 @@ enum { #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ +#ifndef __KERNEL__ #define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */ +#endif #endif -- cgit v1.2.3 From 263ea09084d172cac6e40459a690babe8de8e448 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:26 +0100 Subject: Revert "genl: Add genlmsg_new_unicast() for unicast message allocation" This reverts commit bb9b18fb55b0 ("genl: Add genlmsg_new_unicast() for unicast message allocation")'. Nothing wrong with it; its no longer needed since this was only for mmapped netlink support. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/genetlink.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 43c0e771f417..8d4608ce8716 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -83,7 +83,6 @@ struct genl_family { * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers - * @dst_sk: destination socket */ struct genl_info { u32 snd_seq; @@ -94,7 +93,6 @@ struct genl_info { struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; - struct sock * dst_sk; }; static inline struct net *genl_info_net(struct genl_info *info) @@ -188,8 +186,6 @@ int genl_unregister_family(struct genl_family *family); void genl_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, struct genl_family *family, int flags, u8 cmd); -- cgit v1.2.3 From 905f0a739ad82c6371fb0cb0e71db14a750702ad Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:27 +0100 Subject: nfnetlink: remove nfnetlink_alloc_skb Following mmapped netlink removal this code can be simplified by removing the alloc wrapper. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index ba0d9789eb6e..1d82dd5e9a08 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -34,8 +34,6 @@ int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); int nfnetlink_has_listeners(struct net *net, unsigned int group); -struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, - u32 dst_portid, gfp_t gfp_mask); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); -- cgit v1.2.3 From c5b0db3263b92526bc0c1b6380c0c99f91f069fc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:28 +0100 Subject: nfnetlink: Revert "nfnetlink: add support for memory mapped netlink" reverts commit 3ab1f683bf8b ("nfnetlink: add support for memory mapped netlink")' Like previous commits in the series, remove wrappers that are not needed after mmapped netlink removal. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netlink.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 0b41959aab9f..da14ab61f363 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -69,16 +69,6 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group) extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_has_listeners(struct sock *sk, unsigned int group); -extern struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size, - unsigned int ldiff, u32 dst_portid, - gfp_t gfp_mask); -static inline struct sk_buff * -netlink_alloc_skb(struct sock *ssk, unsigned int size, u32 dst_portid, - gfp_t gfp_mask) -{ - return __netlink_alloc_skb(ssk, size, 0, dst_portid, gfp_mask); -} - extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); -- cgit v1.2.3 From 07dabf20d9867710b90b91108b2adcd448773e25 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 19:19:29 +0100 Subject: vxlan: tun_id is 64bit, not 32bit The tun_id field in struct ip_tunnel_key is __be64, not __be32. We need to convert the vni to tun_id correctly. Fixes: 54bfd872bf16 ("vxlan: keep flags and vni in network byte order") Reported-by: Paolo Abeni Tested-by: Paolo Abeni Signed-off-by: Jiri Benc Acked-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. Miller --- include/net/vxlan.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 1b85a3b40c5a..748083de367a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -294,6 +294,15 @@ static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) #endif } +static inline __be64 vxlan_vni_to_tun_id(__be32 vni) +{ +#if defined(__BIG_ENDIAN) + return (__be64)vni; +#else + return (__be64)vni << 32; +#endif +} + static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; -- cgit v1.2.3 From 7f290c94352e59b1d720055fce760a69a63bd0a1 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 11:22:52 +0100 Subject: iptunnel: scrub packet in iptunnel_pull_header Part of skb_scrub_packet was open coded in iptunnel_pull_header. Let it call skb_scrub_packet directly instead. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 87408ab80856..4dd616376fec 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -270,7 +270,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, return INET_ECN_encapsulate(tos, inner); } -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); -- cgit v1.2.3 From e550785c30f639b3cc6ca70c489a6463ff298453 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 17 Feb 2016 16:20:33 -0800 Subject: ipv6: Annotate change of locking mechanism for np->opt follows up commit 45f6fad84cc3 ("ipv6: add complete rcu protection around np->opt") which added mixed rcu/refcount protection to np->opt. Given the current implementation of rcu_pointer_handoff(), this has no effect at runtime. Signed-off-by: Benjamin Poirier Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6570f379aba2..f3c9857c645d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -259,8 +259,12 @@ static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) rcu_read_lock(); opt = rcu_dereference(np->opt); - if (opt && !atomic_inc_not_zero(&opt->refcnt)) - opt = NULL; + if (opt) { + if (!atomic_inc_not_zero(&opt->refcnt)) + opt = NULL; + else + opt = rcu_pointer_handoff(opt); + } rcu_read_unlock(); return opt; } -- cgit v1.2.3 From 9e74a6dadbbf31ac18a2712048bf866c8e32aab2 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 17 Feb 2016 11:23:55 -0800 Subject: net: Optimize local checksum offload This patch takes advantage of several assumptions we can make about the headers of the frame in order to reduce overall processing overhead for computing the outer header checksum. First we can assume the entire header is in the region pointed to by skb->head as this is what csum_start is based on. Second, as a result of our first assumption, we can just call csum_partial instead of making a call to skb_checksum which would end up having to configure things so that we could walk through the frags list. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39206751463e..89b536796e53 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3713,19 +3713,18 @@ static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) */ static inline __wsum lco_csum(struct sk_buff *skb) { - char *inner_csum_field; - __wsum csum; + unsigned char *csum_start = skb_checksum_start(skb); + unsigned char *l4_hdr = skb_transport_header(skb); + __wsum partial; /* Start with complement of inner checksum adjustment */ - inner_csum_field = skb->data + skb_checksum_start_offset(skb) + - skb->csum_offset; - csum = ~csum_unfold(*(__force __sum16 *)inner_csum_field); + partial = ~csum_unfold(*(__force __sum16 *)(csum_start + + skb->csum_offset)); + /* Add in checksum of our headers (incl. outer checksum - * adjustment filled in by caller) + * adjustment filled in by caller) and return result. */ - csum = skb_checksum(skb, 0, skb_checksum_start_offset(skb), csum); - /* The result is the checksum from skb->data to end of packet */ - return csum; + return csum_partial(l4_hdr, csum_start - l4_hdr, partial); } #endif /* __KERNEL__ */ -- cgit v1.2.3 From 3f9b4a6972d50562613daa649ed064244e6bc7bb Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 18 Feb 2016 17:00:39 +0200 Subject: qed: Lay infrastructure for vlan filtering offload Today, interfaces are working in vlan-promisc mode; But once vlan filtering offloaded would be supported, we'll need a method to control it directly [e.g., when setting device to PROMISC, or when running out of vlan credits]. This adds the necessary API for L2 client to manually choose whether to accept all vlans or only those for which filters were configured. Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_eth_if.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 81ab178e31c1..e53b0ca49e41 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -33,6 +33,8 @@ struct qed_update_vport_params { u8 vport_id; u8 update_vport_active_flg; u8 vport_active_flg; + u8 update_accept_any_vlan_flg; + u8 accept_any_vlan; u8 update_rss_flg; struct qed_update_vport_rss_params rss_params; }; -- cgit v1.2.3 From 2125715635053d4207a756a35aa718f548824e58 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 16 Feb 2016 12:46:54 +0100 Subject: bridge: mdb: add support for more attributes and export timer Currently mdb entries are exported directly as a structure inside MDBA_MDB_ENTRY_INFO attribute, we can't really extend it without breaking user-space. In order to export new mdb fields, I've converted the MDBA_MDB_ENTRY_INFO into a nested attribute which starts like before with struct br_mdb_entry (without header, as it's casted directly in iproute2) and continues with MDBA_MDB_EATTR_ attributes. This way we keep compatibility with older users and can export new data. I've tested this with iproute2, both with and without support for the added attribute and it works fine. So basically we again have MDBA_MDB_ENTRY_INFO with struct br_mdb_entry inside but it may contain also some additional MDBA_MDB_EATTR_ attributes such as MDBA_MDB_EATTR_TIMER which can be parsed by user-space. So the new structure is: [MDBA_MDB] = { [MDBA_MDB_ENTRY] = { [MDBA_MDB_ENTRY_INFO] [MDBA_MDB_ENTRY_INFO] { <- Nested attribute struct br_mdb_entry <- nla_put_nohdr() [MDBA_MDB_ENTRY attributes] <- normal netlink attributes } } } Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index ec3547234998..0890b217580d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -137,7 +137,10 @@ struct bridge_vlan_info { /* Bridge multicast database attributes * [MDBA_MDB] = { * [MDBA_MDB_ENTRY] = { - * [MDBA_MDB_ENTRY_INFO] + * [MDBA_MDB_ENTRY_INFO] { + * struct br_mdb_entry + * [MDBA_MDB_EATTR attributes] + * } * } * } * [MDBA_ROUTER] = { @@ -166,6 +169,14 @@ enum { }; #define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1) +/* per mdb entry additional attributes */ +enum { + MDBA_MDB_EATTR_UNSPEC, + MDBA_MDB_EATTR_TIMER, + __MDBA_MDB_EATTR_MAX +}; +#define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) + enum { MDBA_ROUTER_UNSPEC, MDBA_ROUTER_PORT, -- cgit v1.2.3 From e52bc7c28ac9f54db6f86b19ed65c599def18c98 Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Fri, 19 Feb 2016 09:23:59 -0500 Subject: lib/bitmap.c: conversion routines to/from u32 array Aimed at transferring bitmaps to/from user-space in a 32/64-bit agnostic way. Tested: unit tests (next patch) on qemu i386, x86_64, ppc, ppc64 BE and LE, ARM. Signed-off-by: David Decotigny Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/bitmap.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 9653fdb76a42..e9b0b9ab07e5 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -59,6 +59,8 @@ * bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region * bitmap_release_region(bitmap, pos, order) Free specified bit region * bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region + * bitmap_from_u32array(dst, nbits, buf, nwords) *dst = *buf (nwords 32b words) + * bitmap_to_u32array(buf, nwords, src, nbits) *buf = *dst (nwords 32b words) */ /* @@ -163,6 +165,14 @@ extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); +extern unsigned int bitmap_from_u32array(unsigned long *bitmap, + unsigned int nbits, + const u32 *buf, + unsigned int nwords); +extern unsigned int bitmap_to_u32array(u32 *buf, + unsigned int nwords, + const unsigned long *bitmap, + unsigned int nbits); #ifdef __BIG_ENDIAN extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); #else -- cgit v1.2.3 From ac2c7ad0e5d6030452c9af2fafd192e17fd04264 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 19 Feb 2016 09:24:01 -0500 Subject: net/ethtool: introduce a new ioctl for per queue setting Introduce a new ioctl ETHTOOL_PERQUEUE for per queue parameters setting. The following patches will enable some SUB_COMMANDs for per queue setting. Signed-off-by: Kan Liang Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 190aea0faaf4..f15ae02621a1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1202,6 +1202,21 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) #define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) +#define MAX_NUM_QUEUE 4096 + +/** + * struct ethtool_per_queue_op - apply sub command to the queues in mask. + * @cmd: ETHTOOL_PERQUEUE + * @sub_command: the sub command which apply to each queues + * @queue_mask: Bitmap of the queues which sub command apply to + * @data: A complete command structure following for each of the queues addressed + */ +struct ethtool_per_queue_op { + __u32 cmd; + __u32 sub_command; + __u32 queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; + char data[]; +}; /* CMDs currently supported */ #define ETHTOOL_GSET 0x00000001 /* Get settings. */ @@ -1285,6 +1300,8 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ #define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */ +#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ + /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET #define SPARC_ETH_SSET ETHTOOL_SSET -- cgit v1.2.3 From 421797b1aa363cb897f29f7d365e068dc9d9db81 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 19 Feb 2016 09:24:02 -0500 Subject: net/ethtool: support get coalesce per queue This patch implements sub command ETHTOOL_GCOALESCE for ioctl ETHTOOL_PERQUEUE. It introduces an interface get_per_queue_coalesce to get coalesce of each masked queue from device driver. Then the interrupt coalescing parameters will be copied back to user space one by one. Signed-off-by: Kan Liang Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 653dc9c4ebac..de56600023a7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -201,6 +201,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * @get_module_eeprom: Get the eeprom information from the plug-in module * @get_eee: Get Energy-Efficient (EEE) supported and status. * @set_eee: Set EEE status (enable/disable) as well as LPI timers. + * @get_per_queue_coalesce: Get interrupt coalescing parameters per queue. + * It must check that the given queue number is valid. If neither a RX nor + * a TX queue has this number, return -EINVAL. If only a RX queue or a TX + * queue has this number, set the inapplicable fields to ~0 and return 0. + * Returns a negative error code or zero. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -279,7 +284,8 @@ struct ethtool_ops { const struct ethtool_tunable *, void *); int (*set_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); - + int (*get_per_queue_coalesce)(struct net_device *, u32, + struct ethtool_coalesce *); }; #endif /* _LINUX_ETHTOOL_H */ -- cgit v1.2.3 From f38d138a7da6510a1184e3bc5f425deb187c3265 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 19 Feb 2016 09:24:03 -0500 Subject: net/ethtool: support set coalesce per queue This patch implements sub command ETHTOOL_SCOALESCE for ioctl ETHTOOL_PERQUEUE. It introduces an interface set_per_queue_coalesce to set coalesce of each masked queue to device driver. The wanted coalesce information are stored in "data" for each masked queue, which can copy from userspace. If it fails to set coalesce to device driver, the value which already set to specific queue will be tried to rollback. Signed-off-by: Kan Liang Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/ethtool.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index de56600023a7..472d7d7b01c2 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -206,6 +206,11 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, set the inapplicable fields to ~0 and return 0. * Returns a negative error code or zero. + * @set_per_queue_coalesce: Set interrupt coalescing parameters per queue. + * It must check that the given queue number is valid. If neither a RX nor + * a TX queue has this number, return -EINVAL. If only a RX queue or a TX + * queue has this number, ignore the inapplicable fields. + * Returns a negative error code or zero. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -286,6 +291,8 @@ struct ethtool_ops { const struct ethtool_tunable *, const void *); int (*get_per_queue_coalesce)(struct net_device *, u32, struct ethtool_coalesce *); + int (*set_per_queue_coalesce)(struct net_device *, u32, + struct ethtool_coalesce *); }; #endif /* _LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 568b329a02f75ed3aaae5eb2cca384cb9e09cb29 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:57 -0800 Subject: perf: generalize perf_callchain . avoid walking the stack when there is no room left in the buffer . generalize get_perf_callchain() to be called from bpf helper Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/perf_event.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a481fa..7da3c25999df 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -964,11 +964,20 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); +extern struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + bool crosstask, bool add_mark); +extern int get_callchain_buffers(void); +extern void put_callchain_buffers(void); -static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) { - if (entry->nr < PERF_MAX_STACK_DEPTH) + if (entry->nr < PERF_MAX_STACK_DEPTH) { entry->ip[entry->nr++] = ip; + return 0; + } else { + return -1; /* no more room, stop walking the stack */ + } } extern int sysctl_perf_event_paranoid; -- cgit v1.2.3 From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Feb 2016 19:58:58 -0800 Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE add new map type to store stack traces and corresponding helper bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id @ctx: struct pt_regs* @map: pointer to stack_trace map @flags: bits 0-7 - numer of stack frames to skip bit 8 - collect user stack instead of kernel bit 9 - compare stacks by hash only bit 10 - if two different stacks hash into the same stackid discard old other bits - reserved Return: >= 0 stackid on success or negative error stackid is a 32-bit integer handle that can be further combined with other data (including other stackid) and used as a key into maps. Userspace will access stackmap using standard lookup/delete syscall commands to retrieve full stack trace for given stackid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 90ee6ab24bc5..0cadbb7456c0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; +extern const struct bpf_func_proto bpf_get_stackid_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2ee0fde1bf96..d3e77da8e9e8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -83,6 +83,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_MAP_TYPE_PERCPU_HASH, BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, }; enum bpf_prog_type { @@ -272,6 +273,20 @@ enum bpf_func_id { */ BPF_FUNC_perf_event_output, BPF_FUNC_skb_load_bytes, + + /** + * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + */ + BPF_FUNC_get_stackid, __BPF_FUNC_MAX_ID, }; @@ -294,6 +309,12 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) +/* BPF_FUNC_get_stackid flags. */ +#define BPF_F_SKIP_FIELD_MASK 0xffULL +#define BPF_F_USER_STACK (1ULL << 8) +#define BPF_F_FAST_STACK_CMP (1ULL << 9) +#define BPF_F_REUSE_STACKID (1ULL << 10) + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ -- cgit v1.2.3 From 745041e2aaf1d668f293aaab4b0f6ad7daa056a5 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Fri, 19 Feb 2016 09:43:16 +0000 Subject: lwtunnel: autoload of lwt modules The lwt implementations using net devices can autoload using the existing mechanism using IFLA_INFO_KIND. However, there's no mechanism that lwt modules not using net devices can use. Therefore, add the ability to autoload modules registering lwt operations for lwt implementations not using a net device so that users don't have to manually load the modules. Only users with the CAP_NET_ADMIN capability can cause modules to be loaded, which is ensured by rtnetlink_rcv_msg rejecting non-RTM_GETxxx messages for users without this capability, and by lwtunnel_build_state not being called in response to RTM_GETxxx messages. Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 66350ce3e955..e9f116e29c22 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -170,6 +170,8 @@ static inline int lwtunnel_input(struct sk_buff *skb) return -EOPNOTSUPP; } -#endif +#endif /* CONFIG_LWTUNNEL */ + +#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */ -- cgit v1.2.3 From 6ceb31ca5f65acff299dbc3da5854d54e147b7d8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 19 Feb 2016 11:26:31 -0800 Subject: VXLAN: Support outer IPv4 Tx checksums by default This change makes it so that if UDP CSUM is not specified we will default to enabling it. The main motivation behind this is the fact that with the use of outer checksum we can greatly improve the performance for VXLAN tunnels on devices that don't know how to parse tunnel headers. Signed-off-by: Alexander Duyck Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 748083de367a..6eda4ed4d78b 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -197,7 +197,7 @@ struct vxlan_dev { #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 #define VXLAN_F_IPV6 0x20 -#define VXLAN_F_UDP_CSUM 0x40 +#define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 #define VXLAN_F_REMCSUM_TX 0x200 -- cgit v1.2.3 From 8e2fe1d9f1a20924f98ea46931a1d7fb092aa876 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:22 +0100 Subject: bpf: add new arg_type that allows for 0 sized stack buffer Currently, when we pass a buffer from the eBPF stack into a helper function, the function proto indicates argument types as ARG_PTR_TO_STACK and ARG_CONST_STACK_SIZE pair. If R contains the former, then R must be of the latter type. Then, verifier checks whether the buffer points into eBPF stack, is initialized, etc. The verifier also guarantees that the constant value passed in R is greater than 0, so helper functions don't need to test for it and can always assume a non-NULL initialized buffer as well as non-0 buffer size. This patch adds a new argument types ARG_CONST_STACK_SIZE_OR_ZERO that allows to also pass NULL as R and 0 as R into the helper function. Such helper functions, of course, need to be able to handle these cases internally then. Verifier guarantees that either R == NULL && R == 0 or R != NULL && R != 0 (like the case of ARG_CONST_STACK_SIZE), any other combinations are not possible to load. I went through various options of extending the verifier, and introducing the type ARG_CONST_STACK_SIZE_OR_ZERO seems to have most minimal changes needed to the verifier. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0cadbb7456c0..51e498e5470e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -65,6 +65,7 @@ enum bpf_arg_type { */ ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ -- cgit v1.2.3 From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:23 +0100 Subject: bpf: add generic bpf_csum_diff helper For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's currently limited to handle 2 and 4 byte changes in a header and feeds the from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When working with IPv6, for example, this makes it rather cumbersome to deal with, similarly when editing larger parts of a header. Instead, extend the API in a more generic way: For bpf_l4_csum_replace(), add a case for header field mask of 0 to change the checksum at a given offset through inet_proto_csum_replace_by_diff(), and provide a helper bpf_csum_diff() that can generically calculate a from/to diff for arbitrary amounts of data. This can be used in multiple ways: for the bpf_l4_csum_replace() only part, this even provides us with the option to insert precalculated diffs from user space f.e. from a map, or from bpf_csum_diff() during runtime. bpf_csum_diff() has a optional from/to stack buffer input, so we can calculate a diff by using a scratchbuffer for scenarios where we're inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers don't need to be of equal size) data. Also, bpf_csum_diff() allows to feed a previous csum into csum_partial(), so the function can also be cascaded. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d3e77da8e9e8..48d0a6c54609 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -287,6 +287,17 @@ enum bpf_func_id { * Return: >= 0 stackid on success or negative error */ BPF_FUNC_get_stackid, + + /** + * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result + */ + BPF_FUNC_csum_diff, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From 3697649ff29e0f647565eed04b27a7779c646a22 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:25 +0100 Subject: bpf: try harder on clones when writing into skb When we're dealing with clones and the area is not writeable, try harder and get a copy via pskb_expand_head(). Replace also other occurences in tc actions with the new skb_try_make_writable(). Reported-by: Ashhad Sheikh Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 89b536796e53..6a57757a86cf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2630,6 +2630,13 @@ static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len skb_headroom(skb) + len <= skb->hdr_len; } +static inline int skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_cloned(skb) && !skb_clone_writable(skb, write_len) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} + static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, int cloned) { -- cgit v1.2.3 From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Feb 2016 23:05:26 +0100 Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp When using this helper for updating UDP checksums, we need to extend this in order to write CSUM_MANGLED_0 for csum computations that result into 0 as sum. Reason we need this is because packets with a checksum could otherwise become incorrectly marked as a packet without a checksum. Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should not turn packets without a checksum into ones with a checksum. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 48d0a6c54609..6496f98d3d68 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -313,6 +313,7 @@ enum bpf_func_id { /* BPF_FUNC_l4_csum_replace flags. */ #define BPF_F_PSEUDO_HDR (1ULL << 4) +#define BPF_F_MARK_MANGLED_0 (1ULL << 5) /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ #define BPF_F_INGRESS (1ULL << 0) -- cgit v1.2.3 From 944945986f125bdbbeaa78dac0c0eadb963eb34a Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Sun, 21 Feb 2016 11:40:10 +0200 Subject: qed: Introduce DMA_REGPAIR_LE FW hsi contains regpairs, mostly for 64-bit address representations. Since same paradigm is applied each time a regpair is filled, this introduces a new utility macro for setting such regpairs. Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_chain.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 41b9049b57e2..5f8fcaaa6504 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -19,6 +19,10 @@ /* dma_addr_t manip */ #define DMA_LO_LE(x) cpu_to_le32(lower_32_bits(x)) #define DMA_HI_LE(x) cpu_to_le32(upper_32_bits(x)) +#define DMA_REGPAIR_LE(x, val) do { \ + (x).hi = DMA_HI_LE((val)); \ + (x).lo = DMA_LO_LE((val)); \ + } while (0) #define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) #define HILO_DMA(hi, lo) HILO_GEN(hi, lo, dma_addr_t) -- cgit v1.2.3 From 6d5d2ee63cee7025badda3b74ae2ef7ab097acfa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 Jan 2016 19:28:58 +0100 Subject: Bluetooth: add LED trigger for indicating HCI is powered up Add support for LED triggers to the Bluetooth subsystem and add kernel config symbol BT_LEDS for it. For now one trigger for indicating "HCI is powered up" is supported. Signed-off-by: Heiner Kallweit Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d4f82edb5cff..dc71473462ac 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -25,6 +25,7 @@ #ifndef __HCI_CORE_H #define __HCI_CORE_H +#include #include #include @@ -396,6 +397,8 @@ struct hci_dev { struct delayed_work rpa_expired; bdaddr_t rpa; + struct led_trigger *power_led; + int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); -- cgit v1.2.3 From 07b0188adf7298bf80a9890d3e90f27e973623d3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 19 Feb 2016 09:59:11 +0100 Subject: mac802154: fix mac header length check I got report about that sometimes the WARN_ON occurs there which should never happen. I came to the conclusion that the mac header is there but inside the headroom of skb. The skb->len information doesn't contain the information about the headroom length and skb->len is lesser than two. We check now if the skb_mac_header pointer is set and the room between mac header pointer and tail pointer. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index da574bbdc333..2e3cdd2048d2 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -247,8 +247,9 @@ struct ieee802154_ops { */ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) { - /* return some invalid fc on failure */ - if (unlikely(skb->len < 2)) { + /* check if we can fc at skb_mac_header of sk buffer */ + if (unlikely(!skb_mac_header_was_set(skb) || + (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) { WARN_ON(1); return cpu_to_le16(0); } -- cgit v1.2.3 From 5609c185f24dffca5f6a9c127106869da150be03 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 22 Feb 2016 09:13:54 +0100 Subject: 6lowpan: iphc: add support for stateful compression This patch introduce support for IPHC stateful address compression. It will offer the context table via one debugfs entry. This debugfs has and directory for each cid entry for the context table. Inside each cid directory there exists the following files: - "active": If the entry is added or deleted. The context table is original a list implementation, this flag will indicate if the context is part of list or not. - "prefix": The ipv6 prefix. - "prefix_length": The prefix length for the prefix. - "compression": The compression flag according RFC6775. This part should be moved into sysfs after some testing time. Also the debugfs entry contains a "show" file which is a pretty-printout for the current context table information. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/6lowpan.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index 2f6a3f2233ed..da3a77d25fcb 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -75,6 +75,8 @@ #define LOWPAN_IPHC_MAX_HC_BUF_LEN (sizeof(struct ipv6hdr) + \ LOWPAN_IPHC_MAX_HEADER_LEN + \ LOWPAN_NHC_MAX_HDR_LEN) +/* SCI/DCI is 4 bit width, so we have maximum 16 entries */ +#define LOWPAN_IPHC_CTX_TABLE_SIZE (1 << 4) #define LOWPAN_DISPATCH_IPV6 0x41 /* 01000001 = 65 */ #define LOWPAN_DISPATCH_IPHC 0x60 /* 011xxxxx = ... */ @@ -98,9 +100,39 @@ enum lowpan_lltypes { LOWPAN_LLTYPE_IEEE802154, }; +enum lowpan_iphc_ctx_flags { + LOWPAN_IPHC_CTX_FLAG_ACTIVE, + LOWPAN_IPHC_CTX_FLAG_COMPRESSION, +}; + +struct lowpan_iphc_ctx { + u8 id; + struct in6_addr pfx; + u8 plen; + unsigned long flags; +}; + +struct lowpan_iphc_ctx_table { + spinlock_t lock; + const struct lowpan_iphc_ctx_ops *ops; + struct lowpan_iphc_ctx table[LOWPAN_IPHC_CTX_TABLE_SIZE]; +}; + +static inline bool lowpan_iphc_ctx_is_active(const struct lowpan_iphc_ctx *ctx) +{ + return test_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags); +} + +static inline bool +lowpan_iphc_ctx_is_compression(const struct lowpan_iphc_ctx *ctx) +{ + return test_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags); +} + struct lowpan_priv { enum lowpan_lltypes lltype; struct dentry *iface_debugfs; + struct lowpan_iphc_ctx_table ctx; /* must be last */ u8 priv[0] __aligned(sizeof(void *)); -- cgit v1.2.3 From a6692754d61a6b3735803783f394880805675f99 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 12 Feb 2016 12:09:39 -0500 Subject: net: dsa: pass bridge down to drivers Some DSA drivers may or may not support multiple software bridges on top of an hardware switch. It is more convenient for them to access the bridge's net_device for finer configuration. Removing the need to craft and access a bitmask also simplifies the code. This patch changes the signature of bridge related functions, update DSA drivers, and removes dsa_slave_br_port_mask. Signed-off-by: Vivien Didelot Tested-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 26a0e86e611e..1c845d7bf0b2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -297,9 +297,8 @@ struct dsa_switch_driver { * Bridge integration */ int (*port_join_bridge)(struct dsa_switch *ds, int port, - u32 br_port_mask); - int (*port_leave_bridge)(struct dsa_switch *ds, int port, - u32 br_port_mask); + struct net_device *bridge); + int (*port_leave_bridge)(struct dsa_switch *ds, int port); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); -- cgit v1.2.3 From 412a6d800c7380c1b87c11080c7da905c27cfea8 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 19:09:05 +0200 Subject: mac80211: support hw managing reorder logic Enable driver to manage the reordering logic itself. This is needed for example for the iwlwifi driver that will support hardware assisted reordering. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6c9c559394b0..ee6305a52251 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1929,6 +1929,11 @@ struct ieee80211_txq { * by just its MAC address; this prevents, for example, the same station * from connecting to two virtual AP interfaces at the same time. * + * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the + * reordering buffer internally, guaranteeing mac80211 receives frames in + * order and does not need to manage its own reorder buffer or BA session + * timeout. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -1965,6 +1970,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU, IEEE80211_HW_BEACON_TX_STATUS, IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR, + IEEE80211_HW_SUPPORTS_REORDERING_BUFFER, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 178830481eee5eea147a1c8fab67a96e09d80345 Mon Sep 17 00:00:00 2001 From: Grzegorz Bajorski Date: Fri, 11 Dec 2015 14:39:46 +0100 Subject: mac80211: allow drivers to report (non-)monitor frames Some drivers offload some frames internally (e.g. AddBa). Reporting such frames to mac80211 would only confuse MLME. However it would be useful to be able to pass such frames to monitor interfaces for sniffing purposes, e.g. when running AP + monitor. To do that allow drivers to tell mac80211 whether a given frame should be: - processed but not delivered to any monitor vif - not processed but delievered to monitor vifs only Signed-off-by: Grzegorz Bajorski Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ee6305a52251..5910085af9e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1031,6 +1031,14 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_LDPC: LDPC was used + * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without + * processing it in any regular way. + * This is useful if drivers offload some frames but still want to report + * them for sniffing purposes. + * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except + * monitor interfaces. + * This is useful if drivers offload some frames but still want to report + * them for sniffing purposes. * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 * @RX_FLAG_10MHZ: 10 MHz (half channel) was used * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used @@ -1071,6 +1079,8 @@ enum mac80211_rx_flags { RX_FLAG_MACTIME_END = BIT(21), RX_FLAG_VHT = BIT(22), RX_FLAG_LDPC = BIT(23), + RX_FLAG_ONLY_MONITOR = BIT(24), + RX_FLAG_SKIP_MONITOR = BIT(25), RX_FLAG_STBC_MASK = BIT(26) | BIT(27), RX_FLAG_10MHZ = BIT(28), RX_FLAG_5MHZ = BIT(29), @@ -1089,6 +1099,7 @@ enum mac80211_rx_flags { * @RX_VHT_FLAG_160MHZ: 160 MHz was used * @RX_VHT_FLAG_BF: packet was beamformed */ + enum mac80211_rx_vht_flags { RX_VHT_FLAG_80MHZ = BIT(0), RX_VHT_FLAG_160MHZ = BIT(1), -- cgit v1.2.3 From 506bcfa8abebdbcebdc17b03e96e38dc0b8ce765 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 13 Dec 2015 15:41:05 +0200 Subject: mac80211: limit the A-MSDU Tx based on peer's capabilities In VHT, the specification allows to limit the number of MSDUs in an A-MSDU in the Extended Capabilities IE. There is also a limitation on the byte size in the VHT IE. In HT, the only limitation is on the byte size. Parse the capabilities from the peer and make them available to the driver. In HT, there is another limitation when a BA agreement is active: the byte size can't be greater than 4095. This is not enforced here. Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 19 +++++++++++++++++++ include/net/mac80211.h | 14 ++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d9ddb89533a7..3b1f6cef9513 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -163,6 +163,14 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */ #define IEEE80211_MAX_FRAME_LEN 2352 +/* Maximal size of an A-MSDU */ +#define IEEE80211_MAX_MPDU_LEN_HT_3839 3839 +#define IEEE80211_MAX_MPDU_LEN_HT_7935 7935 + +#define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 +#define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 +#define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 + #define IEEE80211_MAX_SSID_LEN 32 #define IEEE80211_MAX_MESH_ID_LEN 32 @@ -1505,6 +1513,7 @@ struct ieee80211_vht_operation { #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 0x00000002 +#define IEEE80211_VHT_CAP_MAX_MPDU_MASK 0x00000003 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ 0x00000004 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ 0x00000008 #define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK 0x0000000C @@ -2086,6 +2095,16 @@ enum ieee80211_tdls_actioncode { #define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED BIT(5) #define WLAN_EXT_CAPA8_OPMODE_NOTIF BIT(6) +/* Defines the maximal number of MSDUs in an A-MSDU. */ +#define WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB BIT(7) +#define WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB BIT(0) + +/* + * Fine Timing Measurement Initiator - bit 71 of @WLAN_EID_EXT_CAPABILITY + * information element + */ +#define WLAN_EXT_CAPA9_FTM_INITIATOR BIT(7) + /* TDLS specific payload type in the LLC/SNAP header */ #define WLAN_TDLS_SNAP_RFTYPE 0x2 diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5910085af9e6..df5698ed8052 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1714,6 +1714,18 @@ struct ieee80211_sta_rates { * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. * @mfp: indicates whether the STA uses management frame protection or not. + * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single + * A-MSDU. Taken from the Extended Capabilities element. 0 means + * unlimited. + * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. This + * field is always valid for packets with a VHT preamble. For packets + * with a HT preamble, additional limits apply: + * + If the skb is transmitted as part of a BA agreement, the + * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. + * + If the skb is not part of a BA aggreement, the A-MSDU maximal + * size is min(max_amsdu_len, 7935) bytes. + * Both additional HT limits must be enforced by the low level driver. + * This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2). * @txq: per-TID data TX queues (if driver uses the TXQ abstraction) */ struct ieee80211_sta { @@ -1732,6 +1744,8 @@ struct ieee80211_sta { bool tdls; bool tdls_initiator; bool mfp; + u8 max_amsdu_subframes; + u16 max_amsdu_len; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS]; -- cgit v1.2.3 From 538dc9045251d3d6b5c0216a5c61c32bd9cedac9 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 24 Dec 2015 00:33:26 -0800 Subject: mac80211: Make addr const in SET_IEEE80211_PERM_ADDR() Make the addr parameter const in SET_IEEE80211_PERM_ADDR() to save clients from having to cast away a const qualifier. Signed-off-by: Bjorn Andersson Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index df5698ed8052..566df20dc957 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2217,7 +2217,7 @@ static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev * @hw: the &struct ieee80211_hw to set the MAC address for * @addr: the address to set */ -static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, u8 *addr) +static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr) { memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN); } -- cgit v1.2.3 From dd21dfc645d5dce0657af78761b3fa11a3a95398 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Jan 2016 10:39:23 +0100 Subject: rfkill: disentangle polling pause and suspend When suspended while polling is paused, polling will erroneously resume at resume time. Fix this by tracking pause and suspend in separate state variable and adding the necessary checks. Clarify the documentation on this as well. Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index d9010789b4e8..7af625f6d226 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -104,7 +104,8 @@ int __must_check rfkill_register(struct rfkill *rfkill); * * Pause polling -- say transmitter is off for other reasons. * NOTE: not necessary for suspend/resume -- in that case the - * core stops polling anyway + * core stops polling anyway (but will also correctly handle + * the case of polling having been paused before suspend.) */ void rfkill_pause_polling(struct rfkill *rfkill); -- cgit v1.2.3 From d4634e8dea13ccc969dd3f33dab3873cfdf3bc51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Rechi=20Vita?= Date: Tue, 19 Jan 2016 10:42:42 -0500 Subject: rfkill: Update userspace API documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a note to userspace on the effect of RFKILL_OP_CHANGE_ALL also updating the default state for hotplugged devices. Signed-off-by: João Paulo Rechi Vita [reword a bit] Signed-off-by: Johannes Berg --- include/uapi/linux/rfkill.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 058757f7a733..2e00dcebebd0 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -59,6 +59,8 @@ enum rfkill_type { * @RFKILL_OP_DEL: a device was removed * @RFKILL_OP_CHANGE: a device's state changed -- userspace changes one device * @RFKILL_OP_CHANGE_ALL: userspace changes all devices (of a type, or all) + * into a state, also updating the default state used for devices that + * are hot-plugged later. */ enum rfkill_operation { RFKILL_OP_ADD = 0, -- cgit v1.2.3 From f4a0f0c5264e72d9279fbf9cf48a061526e8f788 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 25 Jan 2016 15:46:34 +0200 Subject: mac80211: add RX_FLAG_MACTIME_PLCP_START The timestamp given by iwlwifi is at the beginning of the frame over the air, at (or during) the SYNC field. Allow such timestamps to be given to mac80211, at least (for now) for frames with non-HT/VHT preambles. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 566df20dc957..31337f81ec03 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1010,6 +1010,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the last symbol of the MPDU * (including FCS) was received. + * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime + * field) is valid and contains the time the SYNC preamble was received. * @RX_FLAG_SHORTPRE: Short preamble was used for this frame * @RX_FLAG_HT: HT MCS was used and rate_idx is MCS index * @RX_FLAG_VHT: VHT MCS was used and rate_index is MCS index @@ -1058,6 +1060,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), + RX_FLAG_MACTIME_PLCP_START = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), -- cgit v1.2.3 From dfdfc2beb0dd7e3a067d2eeacb4623cb48e77658 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 26 Jan 2016 17:11:13 +0100 Subject: mac80211: Parse legacy and HT rate in injected frames Drivers/devices without their own rate control algorithm can get the information what rates they should use from either the radiotap header of injected frames or from the rate control algorithm. But the parsing of the legacy rate information from the radiotap header was removed in commit e6a9854b05c1 ("mac80211/drivers: rewrite the rate control API"). The removal of this feature heavily reduced the usefulness of frame injection when wanting to simulate specific transmission behavior. Having rate parsing together with MCS rates and retry support allows a fine grained selection of the tx behavior of injected frames for these kind of tests. Signed-off-by: Sven Eckelmann Cc: Simon Wunderlich Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 31337f81ec03..dbcd69a6bfda 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -708,12 +708,14 @@ enum mac80211_tx_info_flags { * protocol frame (e.g. EAP) * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll * frame (PS-Poll or uAPSD). + * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * * These flags are used in tx_info->control.flags. */ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0), IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1), + IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), }; /* -- cgit v1.2.3 From f2ac7e301ae6397669ff3f79e691942a9b5d2f39 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Wed, 27 Jan 2016 15:26:12 +0100 Subject: mac80211: expose txq queue depth and size to drivers This will allow drivers to make more educated decisions whether to defer transmission or not. Relying on wake_tx_queue() call count implicitly was not possible because it could be called without queued frame count actually changing on software tx aggregation start/stop code paths. It was also not possible to know how long byte-wise queue was without dequeueing. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index dbcd69a6bfda..fd35fc4d7127 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5596,4 +5596,19 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); */ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); + +/** + * ieee80211_txq_get_depth - get pending frame/byte count of given txq + * + * The values are not guaranteed to be coherent with regard to each other, i.e. + * txq state can change half-way of this function and the caller may end up + * with "new" frame_cnt and "old" byte_cnt or vice-versa. + * + * @txq: pointer obtained from station or virtual interface + * @frame_cnt: pointer to store frame count + * @byte_cnt: pointer to store byte count + */ +void ieee80211_txq_get_depth(struct ieee80211_txq *txq, + unsigned long *frame_cnt, + unsigned long *byte_cnt); #endif /* MAC80211_H */ -- cgit v1.2.3 From 06470f7468c8b6c95e72ebda803a61a99f4ee446 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Thu, 28 Jan 2016 16:19:25 +0200 Subject: mac80211: add API to allow filtering frames in BA sessions If any frames are dropped that are part of a BA session, the reorder buffer will "indefinitely" (until the timeout) wait for them to come in (or a BAR moving the window) and won't release frames after them. This means it isn't possible to filter frames within a BA session in firmware. Introduce an API function that allows such filtering. Calling this function will move the BA window forward to the new SSN, and allows marking frames after the SSN as having been filtered, so any future reordering activity will release frames while skipping the holes. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fd35fc4d7127..57147749ae42 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -5193,6 +5193,24 @@ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw); void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, const u8 *addr); +/** + * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered + * @pubsta: station struct + * @tid: the session's TID + * @ssn: starting sequence number of the bitmap, all frames before this are + * assumed to be out of the window after the call + * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc. + * @received_mpdus: number of received mpdus in firmware + * + * This function moves the BA window and releases all frames before @ssn, and + * marks frames marked in the bitmap as having been filtered. Afterwards, it + * checks if any frames in the window starting from @ssn can now be released + * (in case they were only waiting for frames that were filtered.) + */ +void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, + u16 ssn, u64 filtered, + u16 received_mpdus); + /** * ieee80211_send_bar - send a BlockAckReq frame * -- cgit v1.2.3 From 34d505193bd10668acf1caba02d2f66bddc23fea Mon Sep 17 00:00:00 2001 From: Lior David Date: Thu, 28 Jan 2016 10:58:25 +0200 Subject: cfg80211: basic support for PBSS network type PBSS (Personal Basic Service Set) is a new BSS type for DMG networks. It is similar to infrastructure BSS, having an AP-like entity called PCP (PBSS Control Point), but it has few differences. PBSS support is mandatory for 11ad devices. Add support for PBSS by introducing a new PBSS flag attribute. The PBSS flag is used in the START_AP command to request starting a PCP instead of an AP, and in the CONNECT command to request connecting to a PCP instead of an AP. Signed-off-by: Lior David Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ include/uapi/linux/nl80211.h | 6 ++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9bcaaf7cd15a..9e1b24c29f0c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -712,6 +712,8 @@ struct cfg80211_acl_data { * @p2p_opp_ps: P2P opportunistic PS * @acl: ACL configuration used by the drivers which has support for * MAC address based access control + * @pbss: If set, start as a PCP instead of AP. Relevant for DMG + * networks. */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -730,6 +732,7 @@ struct cfg80211_ap_settings { u8 p2p_ctwindow; bool p2p_opp_ps; const struct cfg80211_acl_data *acl; + bool pbss; }; /** @@ -1888,6 +1891,8 @@ struct cfg80211_ibss_params { * @ht_capa_mask: The bits of ht_capa which are to be used. * @vht_capa: VHT Capability overrides * @vht_capa_mask: The bits of vht_capa which are to be used. + * @pbss: if set, connect to a PCP instead of AP. Valid for DMG + * networks. */ struct cfg80211_connect_params { struct ieee80211_channel *channel; @@ -1910,6 +1915,7 @@ struct cfg80211_connect_params { struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa; struct ieee80211_vht_cap vht_capa_mask; + bool pbss; }; /** @@ -3489,6 +3495,7 @@ struct cfg80211_cached_keys; * registered for unexpected class 3 frames (AP mode) * @conn: (private) cfg80211 software SME connection state machine data * @connect_keys: (private) keys to set after connection is established + * @conn_bss_type: connecting/connected BSS type * @ibss_fixed: (private) IBSS is using fixed BSSID * @ibss_dfs_possible: (private) IBSS may change to a DFS channel * @event_list: (private) list for internal event processing @@ -3519,6 +3526,7 @@ struct wireless_dev { u8 ssid_len, mesh_id_len, mesh_id_up_len; struct cfg80211_conn *conn; struct cfg80211_cached_keys *connect_keys; + enum ieee80211_bss_type conn_bss_type; struct list_head event_list; spinlock_t event_lock; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5b7b5ebe7ca8..7758969a2a8e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1789,6 +1789,10 @@ enum nl80211_commands { * thus it must not specify the number of iterations, only the interval * between scans. The scan plans are executed sequentially. * Each scan plan is a nested attribute of &enum nl80211_sched_scan_plan. + * @NL80211_ATTR_PBSS: flag attribute. If set it means operate + * in a PBSS. Specified in %NL80211_CMD_CONNECT to request + * connecting to a PCP, and in %NL80211_CMD_START_AP to start + * a PCP instead of AP. Relevant for DMG networks only. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -2164,6 +2168,8 @@ enum nl80211_attrs { NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS, NL80211_ATTR_SCHED_SCAN_PLANS, + NL80211_ATTR_PBSS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From f8079d43cf0f1f0171606e75fcef6fe17bb183f2 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Sun, 14 Feb 2016 13:56:35 +0200 Subject: mac80211: move TKIP TX IVs to public part of key struct Some drivers/devices might want to set the IVs by themselves (and still let mac80211 generate MMIC). Specifically, this is needed when the device does offloading at certain times, and the driver has to make sure that the IVs of new tx frames (from the host) are synchronized with IVs that were potentially used during the offloading. Similarly to CCMP, move the TX IVs of TKIP keys to the public part of the key struct, and export a function to add the IV right into the crypto header. The public tx_pn field is defined as atomic64, so define TKIP_PN_TO_IV16/32 helper macros to convert it to iv16/32 when needed. Since the iv32 used for the p1k cache is taken directly from the frame, we can safely remove iv16/32 from being protected by tkip.txlock. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 57147749ae42..15879b49baad 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1521,9 +1521,8 @@ enum ieee80211_key_flags { * wants to be given when a frame is transmitted and needs to be * encrypted in hardware. * @cipher: The key's cipher suite selector. - * @tx_pn: PN used for TX on non-TKIP keys, may be used by the driver - * as well if it needs to do software PN assignment by itself - * (e.g. due to TSO) + * @tx_pn: PN used for TX keys, may be used by the driver as well if it + * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. * @keyidx: the key index (0-3) * @keylen: key material length @@ -1549,6 +1548,9 @@ struct ieee80211_key_conf { #define IEEE80211_MAX_PN_LEN 16 +#define TKIP_PN_TO_IV16(pn) ((u16)(pn & 0xffff)) +#define TKIP_PN_TO_IV32(pn) ((u32)((pn >> 16) & 0xffffffff)) + /** * struct ieee80211_key_seq - key sequence counter * @@ -4446,6 +4448,21 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u8 *p2k); +/** + * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos + * + * @pos: start of crypto header + * @keyconf: the parameter passed with the set key + * @pn: PN to add + * + * Returns: pointer to the octet following IVs (i.e. beginning of + * the packet payload) + * + * This function writes the tkip IV value to pos (which should + * point to the crypto header) + */ +u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn); + /** * ieee80211_get_key_tx_seq - get key TX sequence counter * -- cgit v1.2.3 From ca48ebbc7ea7e82e3ae4b55aacead0cdb54ff008 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Mon, 15 Feb 2016 12:34:10 +0200 Subject: mac80211: remove ieee80211_get_key_tx_seq/ieee80211_set_key_tx_seq Since the PNs of all the tx keys are now tracked in the public part of the key struct (with atomic counter), we no longer need these functions. dvm and vt665{5,6} are currently the only users of these functions, so update them accordingly. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 15879b49baad..66155d3ad7e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4463,23 +4463,6 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, */ u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn); -/** - * ieee80211_get_key_tx_seq - get key TX sequence counter - * - * @keyconf: the parameter passed with the set key - * @seq: buffer to receive the sequence data - * - * This function allows a driver to retrieve the current TX IV/PN - * for the given key. It must not be called if IV generation is - * offloaded to the device. - * - * Note that this function may only be called when no TX processing - * can be done concurrently, for example when queues are stopped - * and the stop has been synchronized. - */ -void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq); - /** * ieee80211_get_key_rx_seq - get key RX sequence counter * @@ -4499,23 +4482,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); -/** - * ieee80211_set_key_tx_seq - set key TX sequence counter - * - * @keyconf: the parameter passed with the set key - * @seq: new sequence data - * - * This function allows a driver to set the current TX IV/PNs for the - * given key. This is useful when resuming from WoWLAN sleep and the - * device may have transmitted frames using the PTK, e.g. replies to - * ARP requests. - * - * Note that this function may only be called when no TX processing - * can be done concurrently. - */ -void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq); - /** * ieee80211_set_key_rx_seq - set key RX sequence counter * -- cgit v1.2.3 From 65554d07adfc22bb9e14f6df8c609a646f869a74 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 16 Feb 2016 12:48:17 +0200 Subject: mac80211: provide interface to driver to set VHT MU-MIMO data Provide an interface to the lower level driver to set the VHT MU-MIMO data. This is needed for example when there is an update of the group data during low power state, where the management frame will not be passed to the host at all. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 66155d3ad7e6..23f2a5ecf669 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5445,6 +5445,21 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif) return ieee80211_iftype_p2p(vif->type, vif->p2p); } +/** + * ieee80211_update_mu_groups - set the VHT MU-MIMO groud data + * + * @vif: the specified virtual interface + * @membership: 64 bits array - a bit is set if station is member of the group + * @position: 2 bits per group id indicating the position in the group + * + * Note: This function assumes that the given vif is valid and the position and + * membership data is of the correct size and are in the same byte order as the + * matching GroupId management frame. + * Calls to this function need to be serialized with RX path. + */ +void ieee80211_update_mu_groups(struct ieee80211_vif *vif, + const u8 *membership, const u8 *position); + void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold); -- cgit v1.2.3 From b5a33d52595f0cb153f09bf45a5dcd66a7418dbb Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 16 Feb 2016 12:48:18 +0200 Subject: mac80211: move MU_MIMO_OWNER flag to ieee80211_vif Drivers may need to track which vif is using VHT MU-MIMO. Move the flag indicationg the ownership of MU_MIMO to ieee80211_vif. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 23f2a5ecf669..0c09da34b67a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1382,6 +1382,7 @@ enum ieee80211_vif_flags { * @csa_active: marks whether a channel switch is going on. Internally it is * write-protected by sdata_lock and local->mtx so holding either is fine * for read access. + * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed @@ -1408,6 +1409,7 @@ struct ieee80211_vif { u8 addr[ETH_ALEN]; bool p2p; bool csa_active; + bool mu_mimo_owner; u8 cab_queue; u8 hw_queue[IEEE80211_NUM_ACS]; -- cgit v1.2.3 From 0c9ca11b1ae8eb16c1b6bbae91991392d2321372 Mon Sep 17 00:00:00 2001 From: Beni Lev Date: Wed, 17 Feb 2016 20:30:00 +0200 Subject: cfg80211: Add global RRM capability Today, the supplicant will add the RRM capabilities Information Element in the association request only if Quiet period is supported (NL80211_FEATURE_QUIET). Quiet is one of many RRM features, and there are other RRM features that are not related to Quiet (e.g. neighbor report). Therefore, requiring Quiet to enable RRM is too restrictive. Some of the features, like neighbor report, can be supported by user space without any help from the kernel. Hence adding the RRM capabilities IE to association request should be the sole user space's decision. Removing the RRM dependency on Quiet in the driver solves this problem, but using an old driver with a user space tool that would not require Quiet feature would be problematic: the user space would add NL80211_ATTR_USE_RRM in the association request even if the kernel doesn't advertize NL80211_FEATURE_QUIET and the association would be denied by the kernel. This solution adds a global RRM capability, that tells user space that it can request RRM capabilities IE publishment without any specific feature support in the kernel. Signed-off-by: Beni Lev Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 7758969a2a8e..5a30a7563633 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1727,6 +1727,8 @@ enum nl80211_commands { * underlying device supports these minimal RRM features: * %NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES, * %NL80211_FEATURE_QUIET, + * Or, if global RRM is supported, see: + * %NL80211_EXT_FEATURE_RRM * If this flag is used, driver must add the Power Capabilities IE to the * association request. In addition, it must also set the RRM capability * flag in the association request's Capability Info field. @@ -4402,12 +4404,18 @@ enum nl80211_feature_flags { /** * enum nl80211_ext_feature_index - bit index of extended features. * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates. + * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can + * can request to use RRM (see %NL80211_ATTR_USE_RRM) with + * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set + * the ASSOC_REQ_USE_RRM flag in the association request even if + * NL80211_FEATURE_QUIET is not advertized. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_VHT_IBSS, + NL80211_EXT_FEATURE_RRM, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 648b50dd6abf8e6e5b589bb8e6873a4596389dbe Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 25 Jan 2016 12:03:46 +0300 Subject: net: rfkill: add rfkill_find_type function Helper for finding the type based on name. Useful if the type needs to be determined based on device property. Signed-off-by: Heikki Krogerus [modify rfkill_types array and BUILD_BUG_ON to not cause errors] Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 7af625f6d226..e6a0031d1b1f 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -213,6 +213,15 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw); * @rfkill: rfkill struct to query */ bool rfkill_blocked(struct rfkill *rfkill); + +/** + * rfkill_find_type - Helpper for finding rfkill type by name + * @name: the name of the type + * + * Returns enum rfkill_type that conrresponds the name. + */ +enum rfkill_type rfkill_find_type(const char *name); + #else /* !RFKILL */ static inline struct rfkill * __must_check rfkill_alloc(const char *name, @@ -269,6 +278,12 @@ static inline bool rfkill_blocked(struct rfkill *rfkill) { return false; } + +static inline enum rfkill_type rfkill_find_type(const char *name) +{ + return RFKILL_TYPE_ALL; +} + #endif /* RFKILL || RFKILL_MODULE */ -- cgit v1.2.3 From fb2e6b7b7b02ab35a9d5355a69097a6f60c69d38 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 25 Jan 2016 12:03:49 +0300 Subject: net: rfkill: gpio: remove rfkill_gpio_platform_data No more users for it. Signed-off-by: Heikki Krogerus Signed-off-by: Johannes Berg --- include/linux/rfkill-gpio.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 include/linux/rfkill-gpio.h (limited to 'include') diff --git a/include/linux/rfkill-gpio.h b/include/linux/rfkill-gpio.h deleted file mode 100644 index 20bcb55498cd..000000000000 --- a/include/linux/rfkill-gpio.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2011, NVIDIA Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - - -#ifndef __RFKILL_GPIO_H -#define __RFKILL_GPIO_H - -#include -#include - -/** - * struct rfkill_gpio_platform_data - platform data for rfkill gpio device. - * for unused gpio's, the expected value is -1. - * @name: name for the gpio rf kill instance - */ - -struct rfkill_gpio_platform_data { - char *name; - enum rfkill_type type; -}; - -#endif /* __RFKILL_GPIO_H */ -- cgit v1.2.3 From ada68c31ba9c02d7aabdd87db979fe670b499d54 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Mon, 22 Feb 2016 18:17:23 +0200 Subject: net/mlx5: Introduce a new header file for physical port functions All the device physical port access functions are implemented in the port.c file. We just extract the exposure of these functions from driver.h into a dedicated header file called port.h. Signed-off-by: Achiad Shochat Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 31 -------------------- include/linux/mlx5/port.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 31 deletions(-) create mode 100644 include/linux/mlx5/port.h (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1e3006dcf35d..02adc67720ce 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -794,37 +794,6 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); -int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); -int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, - int ptys_size, int proto_mask, u8 local_port); -int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, - u32 *proto_cap, int proto_mask); -int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, - u32 *proto_admin, int proto_mask); -int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, - u8 *link_width_oper, u8 local_port); -int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, - u8 *proto_oper, int proto_mask, - u8 local_port); -int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, - int proto_mask); -int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, - enum mlx5_port_status status); -int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, - enum mlx5_port_status *status); - -int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port); -void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port); -void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, - u8 port); - -int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, - u8 *vl_hw_cap, u8 local_port); - -int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause); -int mlx5_query_port_pause(struct mlx5_core_dev *dev, - u32 *rx_pause, u32 *tx_pause); - int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h new file mode 100644 index 000000000000..7accd4a65da5 --- /dev/null +++ b/include/linux/mlx5/port.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MLX5_PORT_H__ +#define __MLX5_PORT_H__ + +#include + +int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps); +int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, + int ptys_size, int proto_mask, u8 local_port); +int mlx5_query_port_proto_cap(struct mlx5_core_dev *dev, + u32 *proto_cap, int proto_mask); +int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, + u32 *proto_admin, int proto_mask); +int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, + u8 *link_width_oper, u8 local_port); +int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, int proto_mask, + u8 local_port); +int mlx5_set_port_proto(struct mlx5_core_dev *dev, u32 proto_admin, + int proto_mask); +int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status status); +int mlx5_query_port_admin_status(struct mlx5_core_dev *dev, + enum mlx5_port_status *status); + +int mlx5_set_port_mtu(struct mlx5_core_dev *dev, int mtu, u8 port); +void mlx5_query_port_max_mtu(struct mlx5_core_dev *dev, int *max_mtu, u8 port); +void mlx5_query_port_oper_mtu(struct mlx5_core_dev *dev, int *oper_mtu, + u8 port); + +int mlx5_query_port_vl_hw_cap(struct mlx5_core_dev *dev, + u8 *vl_hw_cap, u8 local_port); + +int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause); +int mlx5_query_port_pause(struct mlx5_core_dev *dev, + u32 *rx_pause, u32 *tx_pause); + +#endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From ad909eb064219a64fd10e9c7d9f39a3042760025 Mon Sep 17 00:00:00 2001 From: Achiad Shochat Date: Mon, 22 Feb 2016 18:17:24 +0200 Subject: net/mlx5: Introduce physical port PFC access functions Add access functions to set and query a physical port PFC (Priority Flow Control) parameters. Signed-off-by: Achiad Shochat Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/port.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 7accd4a65da5..4b3644caa936 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -66,4 +66,8 @@ int mlx5_set_port_pause(struct mlx5_core_dev *dev, u32 rx_pause, u32 tx_pause); int mlx5_query_port_pause(struct mlx5_core_dev *dev, u32 *rx_pause, u32 *tx_pause); +int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx); +int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, + u8 *pfc_en_rx); + #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From 4f3961eeafe0aca8f6b0933899ef0d91f561352d Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 22 Feb 2016 18:17:25 +0200 Subject: net/mlx5: Introduce physical port TC/prio access functions Add access functions to set and query a physical port TC groups and prio parameters. Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 49 ++++++++++++++++++++++++++++++++++++++++++- include/linux/mlx5/port.h | 6 ++++++ 3 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 02adc67720ce..a815da92d4eb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -99,6 +99,8 @@ enum { }; enum { + MLX5_REG_QETCR = 0x4005, + MLX5_REG_QTCT = 0x400a, MLX5_REG_PCAP = 0x5001, MLX5_REG_PMTU = 0x5003, MLX5_REG_PTYS = 0x5004, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e540fc2b..ec957e059de8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -729,7 +729,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_1bf[0x3]; u8 log_max_msg[0x5]; - u8 reserved_at_1c7[0x18]; + u8 reserved_at_1c7[0x4]; + u8 max_tc[0x4]; + u8 reserved_at_1cf[0x10]; u8 stat_rate_support[0x10]; u8 reserved_at_1ef[0xc]; @@ -7061,4 +7063,49 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 reserved_at_100[0x100]; }; +struct mlx5_ifc_ets_tcn_config_reg_bits { + u8 g[0x1]; + u8 b[0x1]; + u8 r[0x1]; + u8 reserved_at_3[0x9]; + u8 group[0x4]; + u8 reserved_at_10[0x9]; + u8 bw_allocation[0x7]; + + u8 reserved_at_20[0xc]; + u8 max_bw_units[0x4]; + u8 reserved_at_30[0x8]; + u8 max_bw_value[0x8]; +}; + +struct mlx5_ifc_ets_global_config_reg_bits { + u8 reserved_at_0[0x2]; + u8 r[0x1]; + u8 reserved_at_3[0x1d]; + + u8 reserved_at_20[0xc]; + u8 max_bw_units[0x4]; + u8 reserved_at_30[0x8]; + u8 max_bw_value[0x8]; +}; + +struct mlx5_ifc_qetc_reg_bits { + u8 reserved_at_0[0x8]; + u8 port_number[0x8]; + u8 reserved_at_10[0x30]; + + struct mlx5_ifc_ets_tcn_config_reg_bits tc_configuration[0x8]; + struct mlx5_ifc_ets_global_config_reg_bits global_configuration; +}; + +struct mlx5_ifc_qtct_reg_bits { + u8 reserved_at_0[0x8]; + u8 port_number[0x8]; + u8 reserved_at_10[0xd]; + u8 prio[0x3]; + + u8 reserved_at_20[0x1d]; + u8 tclass[0x3]; +}; + #endif /* MLX5_IFC_H */ diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 4b3644caa936..0c67e699d017 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -70,4 +70,10 @@ int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx); int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx); +int mlx5_max_tc(struct mlx5_core_dev *mdev); + +int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); +int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group); +int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw); + #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From d8880795dabf2381ed1e98348f6d9c7ea6fab950 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 22 Feb 2016 18:17:28 +0200 Subject: net/mlx5e: Implement DCBNL IEEE max rate Add support for DCBNL IEEE get/set max rate. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/port.h | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 987764afa65c..bfc1ab0552d3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -350,6 +350,12 @@ enum { MLX5_SET_PORT_PKEY_TABLE = 20, }; +enum { + MLX5_BW_NO_LIMIT = 0, + MLX5_100_MBPS_UNIT = 3, + MLX5_GBPS_UNIT = 4, +}; + enum { MLX5_MAX_PAGE_SHIFT = 31 }; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 0c67e699d017..595c7b2d9bfa 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -75,5 +75,11 @@ int mlx5_max_tc(struct mlx5_core_dev *mdev); int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group); int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw); +int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_unit); +int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev, + u8 *max_bw_value, + u8 *max_bw_unit); #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From 928cfe8745a62e60c1e8e06676a74724e7786024 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 22 Feb 2016 18:17:29 +0200 Subject: net/mlx5e: Wake On LAN support Implement set/get WOL by ethtool and added the needed device commands and structures to mlx5_ifc. Signed-off-by: Tariq Toukan Signed-off-by: Rana Shahout Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 11 ++++++++ include/linux/mlx5/mlx5_ifc.h | 62 ++++++++++++++++++++++++++++++++++++++++++- include/linux/mlx5/port.h | 2 ++ 3 files changed, 74 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bfc1ab0552d3..68a56bc37df2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1183,6 +1183,17 @@ enum { MLX5_RQC_RQ_TYPE_MEMORY_RQ_RPM = 0x1, }; +enum mlx5_wol_mode { + MLX5_WOL_DISABLE = 0, + MLX5_WOL_SECURED_MAGIC = 1 << 1, + MLX5_WOL_MAGIC = 1 << 2, + MLX5_WOL_ARP = 1 << 3, + MLX5_WOL_BROADCAST = 1 << 4, + MLX5_WOL_MULTICAST = 1 << 5, + MLX5_WOL_UNICAST = 1 << 6, + MLX5_WOL_PHY_ACTIVITY = 1 << 7, +}; + /* MLX5 DEV CAPs */ /* TODO: EAT.ME */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ec957e059de8..03ffe9530365 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -166,6 +166,8 @@ enum { MLX5_CMD_OP_SET_L2_TABLE_ENTRY = 0x829, MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY = 0x82a, MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b, + MLX5_CMD_OP_SET_WOL_ROL = 0x830, + MLX5_CMD_OP_QUERY_WOL_ROL = 0x831, MLX5_CMD_OP_CREATE_TIR = 0x900, MLX5_CMD_OP_MODIFY_TIR = 0x901, MLX5_CMD_OP_DESTROY_TIR = 0x902, @@ -731,7 +733,17 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_msg[0x5]; u8 reserved_at_1c7[0x4]; u8 max_tc[0x4]; - u8 reserved_at_1cf[0x10]; + u8 reserved_at_1cf[0x6]; + u8 rol_s[0x1]; + u8 rol_g[0x1]; + u8 reserved_at_1d7[0x1]; + u8 wol_s[0x1]; + u8 wol_g[0x1]; + u8 wol_a[0x1]; + u8 wol_b[0x1]; + u8 wol_m[0x1]; + u8 wol_u[0x1]; + u8 wol_p[0x1]; u8 stat_rate_support[0x10]; u8 reserved_at_1ef[0xc]; @@ -6873,6 +6885,54 @@ struct mlx5_ifc_mtt_bits { u8 rd_en[0x1]; }; +struct mlx5_ifc_query_wol_rol_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 rol_mode[0x8]; + u8 wol_mode[0x8]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_query_wol_rol_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_wol_rol_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_set_wol_rol_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 rol_mode_valid[0x1]; + u8 wol_mode_valid[0x1]; + u8 reserved_at_42[0xe]; + u8 rol_mode[0x8]; + u8 wol_mode[0x8]; + + u8 reserved_at_60[0x20]; +}; + enum { MLX5_INITIAL_SEG_NIC_INTERFACE_FULL_DRIVER = 0x0, MLX5_INITIAL_SEG_NIC_INTERFACE_DISABLED = 0x1, diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 595c7b2d9bfa..a1d145abd4eb 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -81,5 +81,7 @@ int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev, int mlx5_query_port_ets_rate_limit(struct mlx5_core_dev *mdev, u8 *max_bw_value, u8 *max_bw_unit); +int mlx5_set_port_wol(struct mlx5_core_dev *mdev, u8 wol_mode); +int mlx5_query_port_wol(struct mlx5_core_dev *mdev, u8 *wol_mode); #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From 1d4150c02c5709fdfd80f10368a31867de35e72e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 22 Feb 2016 15:57:52 -0800 Subject: net_sched: prepare tcf_hashinfo_destroy() for netns support We only release the memory of the hashtable itself, not its entries inside. This is not a problem yet since we only call it in module release path, and module is refcount'ed by actions. This would be a problem after we move the per module hinfo into per netns in the latter patch. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9d446f136607..8c4e3ff723fb 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -65,11 +65,6 @@ static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask) return 0; } -static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf) -{ - kfree(hf->htab); -} - /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. */ -- cgit v1.2.3 From ddf97ccdd7cb7e00daba465a5c947b8d941dc2a4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 22 Feb 2016 15:57:53 -0800 Subject: net_sched: add network namespace support for tc actions Currently tc actions are stored in a per-module hashtable, therefore are visible to all network namespaces. This is probably the last part of the tc subsystem which is not aware of netns now. This patch makes them per-netns, several tc action API's need to be adjusted for this. The tc action API code is ugly due to historical reasons, we need to refactor that code in the future. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 58 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8c4e3ff723fb..342be6c5ab5c 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -7,6 +7,8 @@ #include #include +#include +#include struct tcf_common { struct hlist_node tcfc_head; @@ -87,31 +89,65 @@ struct tc_action { __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ __u32 order; struct list_head list; + struct tcf_hashinfo *hinfo; }; struct tc_action_ops { struct list_head head; - struct tcf_hashinfo *hinfo; char kind[IFNAMSIZ]; __u32 type; /* TBD to match kind */ struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *, int bind); - int (*lookup)(struct tc_action *, u32); + int (*lookup)(struct net *, struct tc_action *, u32); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *act, int ovr, int bind); - int (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *); + int (*walk)(struct net *, struct sk_buff *, + struct netlink_callback *, int, struct tc_action *); +}; + +struct tc_action_net { + struct tcf_hashinfo *hinfo; + const struct tc_action_ops *ops; }; -int tcf_hash_search(struct tc_action *a, u32 index); -u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo); -int tcf_hash_check(u32 index, struct tc_action *a, int bind); -int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind, bool cpustats); +static inline +int tc_action_net_init(struct tc_action_net *tn, const struct tc_action_ops *ops, + unsigned int mask) +{ + int err = 0; + + tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL); + if (!tn->hinfo) + return -ENOMEM; + tn->ops = ops; + err = tcf_hashinfo_init(tn->hinfo, mask); + if (err) + kfree(tn->hinfo); + return err; +} + +void tcf_hashinfo_destroy(const struct tc_action_ops *ops, + struct tcf_hashinfo *hinfo); + +static inline void tc_action_net_exit(struct tc_action_net *tn) +{ + tcf_hashinfo_destroy(tn->ops, tn->hinfo); +} + +int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a); +int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index); +u32 tcf_hash_new_index(struct tc_action_net *tn); +int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind); +int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action *a, int size, int bind, bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); -void tcf_hash_insert(struct tc_action *a); +void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a); int __tcf_hash_release(struct tc_action *a, bool bind, bool strict); @@ -120,8 +156,8 @@ static inline int tcf_hash_release(struct tc_action *a, bool bind) return __tcf_hash_release(a, bind, false); } -int tcf_register_action(struct tc_action_ops *a, unsigned int mask); -int tcf_unregister_action(struct tc_action_ops *a); +int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); +int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_action_destroy(struct list_head *actions, int bind); int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, struct tcf_result *res); -- cgit v1.2.3 From 65aebfc002abc1827ac7c8644a2bba0459ce3ce2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 Feb 2016 12:13:54 -0500 Subject: net: dsa: add port_vlan_dump routine Similar to port_fdb_dump, add a port_vlan_dump function to DSA drivers which gets passed the switchdev VLAN object and callback. This function, if implemented, takes precedence over the soon legacy vlan_getnext/port_pvid_get approach. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1c845d7bf0b2..ebc0d9ea96a1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -313,6 +313,9 @@ struct dsa_switch_driver { struct switchdev_trans *trans); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); + int (*port_vlan_dump)(struct dsa_switch *ds, int port, + struct switchdev_obj_port_vlan *vlan, + int (*cb)(struct switchdev_obj *obj)); int (*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid); int (*vlan_getnext)(struct dsa_switch *ds, u16 *vid, unsigned long *ports, unsigned long *untagged); -- cgit v1.2.3 From 477b184526a7f44164029eea720da0e0c888cac6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 Feb 2016 12:13:56 -0500 Subject: net: dsa: drop vlan_getnext The VLAN GetNext operation is specific to some switches, and thus can be complicated to implement for some drivers. Remove the support for the vlan_getnext/port_pvid_get approach in favor of the generic and simpler port_vlan_dump function. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ebc0d9ea96a1..3dd54867174a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -316,9 +316,6 @@ struct dsa_switch_driver { int (*port_vlan_dump)(struct dsa_switch *ds, int port, struct switchdev_obj_port_vlan *vlan, int (*cb)(struct switchdev_obj *obj)); - int (*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid); - int (*vlan_getnext)(struct dsa_switch *ds, u16 *vid, - unsigned long *ports, unsigned long *untagged); /* * Forwarding database -- cgit v1.2.3 From f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 24 Feb 2016 09:25:37 -0800 Subject: net: ipv6: Make address flushing on ifdown optional Currently, all ipv6 addresses are flushed when the interface is configured down, including global, static addresses: $ ip -6 addr show dev eth1 3: eth1: mtu 1500 state UP qlen 1000 inet6 2100:1::2/120 scope global valid_lft forever preferred_lft forever inet6 fe80::e0:f9ff:fe79:34bd/64 scope link valid_lft forever preferred_lft forever $ ip link set dev eth1 down $ ip -6 addr show dev eth1 << nothing; all addresses have been flushed>> Add a new sysctl to make this behavior optional. The new setting defaults to flush all addresses to maintain backwards compatibility. When the set global addresses with no expire times are not flushed on an admin down. The sysctl is per-interface or system-wide for all interfaces $ sysctl -w net.ipv6.conf.eth1.keep_addr_on_down=1 or $ sysctl -w net.ipv6.conf.all.keep_addr_on_down=1 Will keep addresses on eth1 on an admin down. $ ip -6 addr show dev eth1 3: eth1: mtu 1500 state UP qlen 1000 inet6 2100:1::2/120 scope global valid_lft forever preferred_lft forever inet6 fe80::e0:f9ff:fe79:34bd/64 scope link valid_lft forever preferred_lft forever $ ip link set dev eth1 down $ ip -6 addr show dev eth1 3: eth1: mtu 1500 state DOWN qlen 1000 inet6 2100:1::2/120 scope global tentative valid_lft forever preferred_lft forever inet6 fe80::e0:f9ff:fe79:34bd/64 scope link tentative valid_lft forever preferred_lft forever Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 4b2267e1b7c3..7edc14fb66b6 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -62,6 +62,7 @@ struct ipv6_devconf { struct in6_addr secret; } stable_secret; __s32 use_oif_addrs_only; + __s32 keep_addr_on_down; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index ec117b65d5a5..395876060f50 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -176,6 +176,7 @@ enum { DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, DEVCONF_DROP_UNSOLICITED_NA, + DEVCONF_KEEP_ADDR_ON_DOWN, DEVCONF_MAX }; -- cgit v1.2.3 From a87cb3e48ee86d29868d3f59cfb9ce1a8fa63314 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 24 Feb 2016 10:02:52 -0800 Subject: net: Facility to report route quality of connected sockets This patch add the SO_CNX_ADVICE socket option (setsockopt only). The purpose is to allow an application to give feedback to the kernel about the quality of the network path for a connected socket. The value argument indicates the type of quality report. For this initial patch the only supported advice is a value of 1 which indicates "bad path, please reroute"-- the action taken by the kernel is to call dst_negative_advice which will attempt to choose a different ECMP route, reset the TX hash for flow label and UDP source port in encapsulation, etc. This facility should be useful for connected UDP sockets where only the application can provide any feedback about path quality. It could also be useful for TCP applications that have additional knowledge about the path outside of the normal TCP control loop. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/asm-generic/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index fb8a41668382..67d632f1743d 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -90,4 +90,6 @@ #define SO_ATTACH_REUSEPORT_CBPF 51 #define SO_ATTACH_REUSEPORT_EBPF 52 +#define SO_CNX_ADVICE 53 + #endif /* __ASM_GENERIC_SOCKET_H */ -- cgit v1.2.3 From 3f1ac7a700d039c61d8d8b99f28d605d489a60cf Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Wed, 24 Feb 2016 10:57:59 -0800 Subject: net: ethtool: add new ETHTOOL_xLINKSETTINGS API This patch defines a new ETHTOOL_GLINKSETTINGS/SLINKSETTINGS API, handled by the new get_link_ksettings/set_link_ksettings callbacks. This API provides support for most legacy ethtool_cmd fields, adds support for larger link mode masks (up to 4064 bits, variable length), and removes ethtool_cmd deprecated fields (transceiver/maxrxpkt/maxtxpkt). This API is deprecating the legacy ETHTOOL_GSET/SSET API and provides the following backward compatibility properties: - legacy ethtool with legacy drivers: no change, still using the get_settings/set_settings callbacks. - legacy ethtool with new get/set_link_ksettings drivers: the new driver callbacks are used, data internally converted to legacy ethtool_cmd. ETHTOOL_GSET will return only the 1st 32b of each link mode mask. ETHTOOL_SSET will fail if user tries to set the ethtool_cmd deprecated fields to non-0 (transceiver/maxrxpkt/maxtxpkt). A kernel warning is logged if driver sets higher bits. - future ethtool with legacy drivers: no change, still using the get_settings/set_settings callbacks, internally converted to new data structure. Deprecated fields (transceiver/maxrxpkt/maxtxpkt) will be ignored and seen as 0 from user space. Note that that "future" ethtool tool will not allow changes to these deprecated fields. - future ethtool with new drivers: direct call to the new callbacks. By "future" ethtool, what is meant is: - query: first try ETHTOOL_GLINKSETTINGS, and revert to ETHTOOL_GSET if fails - set: query first and remember which of ETHTOOL_GLINKSETTINGS or ETHTOOL_GSET was successful + if ETHTOOL_GLINKSETTINGS was successful, then change config with ETHTOOL_SLINKSETTINGS. A failure there is final (do not try ETHTOOL_SSET). + otherwise ETHTOOL_GSET was successful, change config with ETHTOOL_SSET. A failure there is final (do not try ETHTOOL_SLINKSETTINGS). The interaction user/kernel via the new API requires a small ETHTOOL_GLINKSETTINGS handshake first to agree on the length of the link mode bitmaps. If kernel doesn't agree with user, it returns the bitmap length it is expecting from user as a negative length (and cmd field is 0). When kernel and user agree, kernel returns valid info in all fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS). Data structure crossing user/kernel boundary is 32/64-bit agnostic. Converted internally to a legal kernel bitmap. The internal __ethtool_get_settings kernel helper will gradually be replaced by __ethtool_get_link_ksettings by the time the first "link_settings" drivers start to appear. So this patch doesn't change it, it will be removed before it needs to be changed. Signed-off-by: David Decotigny Signed-off-by: David S. Miller --- include/linux/ethtool.h | 91 ++++++++++-- include/uapi/linux/ethtool.h | 322 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 339 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 472d7d7b01c2..8a400a54c92e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -12,6 +12,7 @@ #ifndef _LINUX_ETHTOOL_H #define _LINUX_ETHTOOL_H +#include #include #include @@ -40,9 +41,6 @@ struct compat_ethtool_rxnfc { #include -extern int __ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd); - /** * enum ethtool_phys_id_state - indicator state for physical identification * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated @@ -97,13 +95,74 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) return index % n_rx_rings; } +/* number of link mode bits/ulongs handled internally by kernel */ +#define __ETHTOOL_LINK_MODE_MASK_NBITS \ + (__ETHTOOL_LINK_MODE_LAST + 1) + +/* declare a link mode bitmap */ +#define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ + DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) + +/* drivers must ignore base.cmd and base.link_mode_masks_nwords + * fields, but they are allowed to overwrite them (will be ignored). + */ +struct ethtool_link_ksettings { + struct ethtool_link_settings base; + struct { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); + __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); + } link_modes; +}; + +/** + * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask + * @ptr : pointer to struct ethtool_link_ksettings + * @name : one of supported/advertising/lp_advertising + */ +#define ethtool_link_ksettings_zero_link_mode(ptr, name) \ + bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS) + +/** + * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings + * link mode mask + * @ptr : pointer to struct ethtool_link_ksettings + * @name : one of supported/advertising/lp_advertising + * @mode : one of the ETHTOOL_LINK_MODE_*_BIT + * (not atomic, no bound checking) + */ +#define ethtool_link_ksettings_add_link_mode(ptr, name, mode) \ + __set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) + +/** + * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask + * @ptr : pointer to struct ethtool_link_ksettings + * @name : one of supported/advertising/lp_advertising + * @mode : one of the ETHTOOL_LINK_MODE_*_BIT + * (not atomic, no bound checking) + * + * Returns true/false. + */ +#define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \ + test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) + +extern int +__ethtool_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *link_ksettings); + +/* DEPRECATED, use __ethtool_get_link_ksettings */ +extern int __ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd); + /** * struct ethtool_ops - optional netdev operations - * @get_settings: Get various device settings including Ethernet link + * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings + * API. Get various device settings including Ethernet link * settings. The @cmd parameter is expected to have been cleared - * before get_settings is called. Returns a negative error code or - * zero. - * @set_settings: Set various device settings including Ethernet link + * before get_settings is called. Returns a negative error code + * or zero. + * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings + * API. Set various device settings including Ethernet link * settings. Returns a negative error code or zero. * @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. If not @@ -211,6 +270,19 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, ignore the inapplicable fields. * Returns a negative error code or zero. + * @get_link_ksettings: When defined, takes precedence over the + * %get_settings method. Get various device settings + * including Ethernet link settings. The %cmd and + * %link_mode_masks_nwords fields should be ignored (use + * %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any + * change to them will be overwritten by kernel. Returns a + * negative error code or zero. + * @set_link_ksettings: When defined, takes precedence over the + * %set_settings method. Set various device settings including + * Ethernet link settings. The %cmd and %link_mode_masks_nwords + * fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS + * instead of the latter), any change to them will be overwritten + * by kernel. Returns a negative error code or zero. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -293,6 +365,9 @@ struct ethtool_ops { struct ethtool_coalesce *); int (*set_per_queue_coalesce)(struct net_device *, u32, struct ethtool_coalesce *); - + int (*get_link_ksettings)(struct net_device *, + struct ethtool_link_ksettings *); + int (*set_link_ksettings)(struct net_device *, + const struct ethtool_link_ksettings *); }; #endif /* _LINUX_ETHTOOL_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f15ae02621a1..37fd6dc33de4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -21,7 +21,8 @@ */ /** - * struct ethtool_cmd - link control and status + * struct ethtool_cmd - DEPRECATED, link control and status + * This structure is DEPRECATED, please use struct ethtool_link_settings. * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET * @supported: Bitmask of %SUPPORTED_* flags for the link modes, * physical connectors and other link features for which the @@ -1219,8 +1220,12 @@ struct ethtool_per_queue_op { }; /* CMDs currently supported */ -#define ETHTOOL_GSET 0x00000001 /* Get settings. */ -#define ETHTOOL_SSET 0x00000002 /* Set settings. */ +#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. + * Please use ETHTOOL_GLINKSETTINGS + */ +#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings. + * Please use ETHTOOL_SLINKSETTINGS + */ #define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */ #define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */ #define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */ @@ -1302,73 +1307,139 @@ struct ethtool_per_queue_op { #define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */ +#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */ +#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ + + /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET #define SPARC_ETH_SSET ETHTOOL_SSET -#define SUPPORTED_10baseT_Half (1 << 0) -#define SUPPORTED_10baseT_Full (1 << 1) -#define SUPPORTED_100baseT_Half (1 << 2) -#define SUPPORTED_100baseT_Full (1 << 3) -#define SUPPORTED_1000baseT_Half (1 << 4) -#define SUPPORTED_1000baseT_Full (1 << 5) -#define SUPPORTED_Autoneg (1 << 6) -#define SUPPORTED_TP (1 << 7) -#define SUPPORTED_AUI (1 << 8) -#define SUPPORTED_MII (1 << 9) -#define SUPPORTED_FIBRE (1 << 10) -#define SUPPORTED_BNC (1 << 11) -#define SUPPORTED_10000baseT_Full (1 << 12) -#define SUPPORTED_Pause (1 << 13) -#define SUPPORTED_Asym_Pause (1 << 14) -#define SUPPORTED_2500baseX_Full (1 << 15) -#define SUPPORTED_Backplane (1 << 16) -#define SUPPORTED_1000baseKX_Full (1 << 17) -#define SUPPORTED_10000baseKX4_Full (1 << 18) -#define SUPPORTED_10000baseKR_Full (1 << 19) -#define SUPPORTED_10000baseR_FEC (1 << 20) -#define SUPPORTED_20000baseMLD2_Full (1 << 21) -#define SUPPORTED_20000baseKR2_Full (1 << 22) -#define SUPPORTED_40000baseKR4_Full (1 << 23) -#define SUPPORTED_40000baseCR4_Full (1 << 24) -#define SUPPORTED_40000baseSR4_Full (1 << 25) -#define SUPPORTED_40000baseLR4_Full (1 << 26) -#define SUPPORTED_56000baseKR4_Full (1 << 27) -#define SUPPORTED_56000baseCR4_Full (1 << 28) -#define SUPPORTED_56000baseSR4_Full (1 << 29) -#define SUPPORTED_56000baseLR4_Full (1 << 30) - -#define ADVERTISED_10baseT_Half (1 << 0) -#define ADVERTISED_10baseT_Full (1 << 1) -#define ADVERTISED_100baseT_Half (1 << 2) -#define ADVERTISED_100baseT_Full (1 << 3) -#define ADVERTISED_1000baseT_Half (1 << 4) -#define ADVERTISED_1000baseT_Full (1 << 5) -#define ADVERTISED_Autoneg (1 << 6) -#define ADVERTISED_TP (1 << 7) -#define ADVERTISED_AUI (1 << 8) -#define ADVERTISED_MII (1 << 9) -#define ADVERTISED_FIBRE (1 << 10) -#define ADVERTISED_BNC (1 << 11) -#define ADVERTISED_10000baseT_Full (1 << 12) -#define ADVERTISED_Pause (1 << 13) -#define ADVERTISED_Asym_Pause (1 << 14) -#define ADVERTISED_2500baseX_Full (1 << 15) -#define ADVERTISED_Backplane (1 << 16) -#define ADVERTISED_1000baseKX_Full (1 << 17) -#define ADVERTISED_10000baseKX4_Full (1 << 18) -#define ADVERTISED_10000baseKR_Full (1 << 19) -#define ADVERTISED_10000baseR_FEC (1 << 20) -#define ADVERTISED_20000baseMLD2_Full (1 << 21) -#define ADVERTISED_20000baseKR2_Full (1 << 22) -#define ADVERTISED_40000baseKR4_Full (1 << 23) -#define ADVERTISED_40000baseCR4_Full (1 << 24) -#define ADVERTISED_40000baseSR4_Full (1 << 25) -#define ADVERTISED_40000baseLR4_Full (1 << 26) -#define ADVERTISED_56000baseKR4_Full (1 << 27) -#define ADVERTISED_56000baseCR4_Full (1 << 28) -#define ADVERTISED_56000baseSR4_Full (1 << 29) -#define ADVERTISED_56000baseLR4_Full (1 << 30) +/* Link mode bit indices */ +enum ethtool_link_mode_bit_indices { + ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0, + ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1, + ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2, + ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5, + ETHTOOL_LINK_MODE_Autoneg_BIT = 6, + ETHTOOL_LINK_MODE_TP_BIT = 7, + ETHTOOL_LINK_MODE_AUI_BIT = 8, + ETHTOOL_LINK_MODE_MII_BIT = 9, + ETHTOOL_LINK_MODE_FIBRE_BIT = 10, + ETHTOOL_LINK_MODE_BNC_BIT = 11, + ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12, + ETHTOOL_LINK_MODE_Pause_BIT = 13, + ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14, + ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15, + ETHTOOL_LINK_MODE_Backplane_BIT = 16, + ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17, + ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18, + ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19, + ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20, + ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21, + ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22, + ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23, + ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24, + ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25, + ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26, + ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27, + ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28, + ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, + ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. The only way to use indices > 31 is to + * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. + */ + + __ETHTOOL_LINK_MODE_LAST + = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT, +}; + +#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ + (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) + +/* DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new SUPPORTED_* macro for bits > 31. + */ +#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new SUPPORTED_* macro for bits > 31, see + * notice above. + */ + +/* + * DEPRECATED macros. Please migrate to + * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT + * define any new ADERTISE_* macro for bits > 31. + */ +#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half) +#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full) +#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half) +#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full) +#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half) +#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full) +#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg) +#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP) +#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI) +#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII) +#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE) +#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC) +#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full) +#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause) +#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause) +#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full) +#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane) +#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full) +#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full) +#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full) +#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC) +#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full) +#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full) +#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full) +#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full) +#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full) +#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full) +#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full) +#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full) +#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full) +#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full) +/* Please do not define any new ADVERTISED_* macro for bits > 31, see + * notice above. + */ /* The following are all involved in forcing a particular link * mode for the device for setting things. When getting the @@ -1533,4 +1604,123 @@ enum ethtool_reset_flags { }; #define ETH_RESET_SHARED_SHIFT 16 + +/** + * struct ethtool_link_settings - link control and status + * + * IMPORTANT, Backward compatibility notice: When implementing new + * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and + * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link + * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS + * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in + * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use + * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link + * settings; do not use %ETHTOOL_SLINKSETTINGS if + * %ETHTOOL_GLINKSETTINGS failed: stick to + * %ETHTOOL_GSET/%ETHTOOL_SSET in that case. + * + * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS + * @speed: Link speed (Mbps) + * @duplex: Duplex mode; one of %DUPLEX_* + * @port: Physical connector type; one of %PORT_* + * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not + * applicable. For clause 45 PHYs this is the PRTAD. + * @autoneg: Enable/disable autonegotiation and auto-detection; + * either %AUTONEG_DISABLE or %AUTONEG_ENABLE + * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO + * protocols supported by the interface; 0 if unknown. + * Read-only. + * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of + * %ETH_TP_MDI_*. If the status is unknown or not applicable, the + * value will be %ETH_TP_MDI_INVALID. Read-only. + * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of + * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads + * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected. + * When written successfully, the link should be renegotiated if + * necessary. + * @link_mode_masks_nwords: Number of 32-bit words for each of the + * supported, advertising, lp_advertising link mode bitmaps. For + * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user + * (>= 0); on return, if handshake in progress, negative if + * request size unsupported by kernel: absolute value indicates + * kernel recommended size and cmd field is 0, as well as all the + * other fields; otherwise (handshake completed), strictly + * positive to indicate size used by kernel and cmd field is + * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For + * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive + * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise + * refused. For drivers: ignore this field (use kernel's + * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will + * be overwritten by kernel. + * @supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only. + * @advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * @lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. + * + * If autonegotiation is disabled, the speed and @duplex represent the + * fixed link mode and are writable if the driver supports multiple + * link modes. If it is enabled then they are read-only; if the link + * is up they represent the negotiated link mode; if the link is down, + * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and + * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode. + * + * Some hardware interfaces may have multiple PHYs and/or physical + * connectors fitted or do not allow the driver to detect which are + * fitted. For these interfaces @port and/or @phy_address may be + * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE. + * Otherwise, attempts to write different values may be ignored or + * rejected. + * + * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt + * are not available in %ethtool_link_settings. Until all drivers are + * converted to ignore them or to the new %ethtool_link_settings API, + * for both queries and changes, users should always try + * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick + * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it + * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and + * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing + * either %ethtool_cmd or %ethtool_link_settings). + * + * Users should assume that all fields not marked read-only are + * writable and subject to validation by the driver. They should use + * %ETHTOOL_GLINKSETTINGS to get the current values before making specific + * changes and then applying them with %ETHTOOL_SLINKSETTINGS. + * + * Drivers that implement %get_link_ksettings and/or + * %set_link_ksettings should ignore the @cmd + * and @link_mode_masks_nwords fields (any change to them overwritten + * by kernel), and rely only on kernel's internal + * %__ETHTOOL_LINK_MODE_MASK_NBITS and + * %ethtool_link_mode_mask_t. Drivers that implement + * %set_link_ksettings() should validate all fields other than @cmd + * and @link_mode_masks_nwords that are not described as read-only or + * deprecated, and must ignore all fields described as read-only. + */ +struct ethtool_link_settings { + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u32 reserved[8]; + __u32 link_mode_masks[0]; + /* layout of link_mode_masks fields: + * __u32 map_supported[link_mode_masks_nwords]; + * __u32 map_advertising[link_mode_masks_nwords]; + * __u32 map_lp_advertising[link_mode_masks_nwords]; + */ +}; #endif /* _UAPI_LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 17605b961f766757fddec20810453fc51b266e77 Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Wed, 24 Feb 2016 10:58:07 -0800 Subject: net: rdma: use __ethtool_get_ksettings Signed-off-by: David Decotigny Signed-off-by: David S. Miller --- include/rdma/ib_addr.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index c34c9002460c..931a47ba4571 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -262,24 +262,22 @@ static inline enum ib_mtu iboe_get_mtu(int mtu) static inline int iboe_get_rate(struct net_device *dev) { - struct ethtool_cmd cmd; - u32 speed; + struct ethtool_link_ksettings cmd; int err; rtnl_lock(); - err = __ethtool_get_settings(dev, &cmd); + err = __ethtool_get_link_ksettings(dev, &cmd); rtnl_unlock(); if (err) return IB_RATE_PORT_CURRENT; - speed = ethtool_cmd_speed(&cmd); - if (speed >= 40000) + if (cmd.base.speed >= 40000) return IB_RATE_40_GBPS; - else if (speed >= 30000) + else if (cmd.base.speed >= 30000) return IB_RATE_30_GBPS; - else if (speed >= 20000) + else if (cmd.base.speed >= 20000) return IB_RATE_20_GBPS; - else if (speed >= 10000) + else if (cmd.base.speed >= 10000) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; -- cgit v1.2.3 From 3237fc63a3297d472a8cec33cb914f20570cfc23 Mon Sep 17 00:00:00 2001 From: David Decotigny Date: Wed, 24 Feb 2016 10:58:11 -0800 Subject: net: ethtool: remove unused __ethtool_get_settings replaced by __ethtool_get_link_ksettings. Signed-off-by: David Decotigny Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 8a400a54c92e..e2b7bf27c03e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -150,10 +150,6 @@ extern int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings); -/* DEPRECATED, use __ethtool_get_link_ksettings */ -extern int __ethtool_get_settings(struct net_device *dev, - struct ethtool_cmd *cmd); - /** * struct ethtool_ops - optional netdev operations * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings -- cgit v1.2.3 From 3f2fb9a834cb1fcddbae22deca7fde136944dc89 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 24 Feb 2016 11:47:02 -0800 Subject: net: l3mdev: address selection should only consider devices in L3 domain David Lamparter noted a use case where the source address selection fails to pick an address from a VRF interface - unnumbered interfaces. Relevant commands from his script: ip addr add 9.9.9.9/32 dev lo ip link set lo up ip link add name vrf0 type vrf table 101 ip rule add oif vrf0 table 101 ip rule add iif vrf0 table 101 ip link set vrf0 up ip addr add 10.0.0.3/32 dev vrf0 ip link add name dummy2 type dummy ip link set dummy2 master vrf0 up --> note dummy2 has no address - unnumbered device ip route add 10.2.2.2/32 dev dummy2 table 101 ip neigh add 10.2.2.2 dev dummy2 lladdr 02:00:00:00:00:02 tcpdump -ni dummy2 & And using ping instead of his socat example: $ ping -I vrf0 -c1 10.2.2.2 ping: Warning: source address might be selected on device other than vrf0. PING 10.2.2.2 (10.2.2.2) from 9.9.9.9 vrf0: 56(84) bytes of data. >From tcpdump: 12:57:29.449128 IP 9.9.9.9 > 10.2.2.2: ICMP echo request, id 2491, seq 1, length 64 Note the source address is from lo and is not a VRF local address. With this patch: $ ping -I vrf0 -c1 10.2.2.2 PING 10.2.2.2 (10.2.2.2) from 10.0.0.3 vrf0: 56(84) bytes of data. >From tcpdump: 12:59:25.096426 IP 10.0.0.3 > 10.2.2.2: ICMP echo request, id 2113, seq 1, length 64 Now the source address comes from vrf0. The ipv4 function for selecting source address takes a const argument. Removing the const requires touching a lot of places, so instead l3mdev_master_ifindex_rcu is changed to take a const argument and then do the typecast to non-const as required by netdev_master_upper_dev_get_rcu. This is similar to what l3mdev_fib_table_rcu does. IPv6 for unnumbered interfaces appears to be selecting the addresses properly. Cc: David Lamparter Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 5567d46b3cff..c43a9c73de5e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -39,7 +39,7 @@ struct l3mdev_ops { #ifdef CONFIG_NET_L3_MASTER_DEV -int l3mdev_master_ifindex_rcu(struct net_device *dev); +int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { int ifindex; @@ -179,7 +179,7 @@ struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net, #else -static inline int l3mdev_master_ifindex_rcu(struct net_device *dev) +static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) { return 0; } -- cgit v1.2.3 From b07edbe1cf3dae9ba81f24888e2f2a9dbe778918 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Feb 2016 17:24:08 +0100 Subject: netfilter: meta: add PRANDOM support Can be used to randomly match packets e.g. for statistic traffic sampling. See commit 3ad0040573b0c00f8848 ("bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs") for more info why this doesn't use prandom_u32 directly. Unlike bpf nft_meta can be built as a module, so add an EXPORT_SYMBOL for prandom_seed_full_state too. Cc: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index be41ffc128b8..b19be0a098c0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -681,6 +681,7 @@ enum nft_exthdr_attributes { * @NFT_META_IIFGROUP: packet input interface group * @NFT_META_OIFGROUP: packet output interface group * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid) + * @NFT_META_PRANDOM: a 32bit pseudo-random number */ enum nft_meta_keys { NFT_META_LEN, @@ -707,6 +708,7 @@ enum nft_meta_keys { NFT_META_IIFGROUP, NFT_META_OIFGROUP, NFT_META_CGROUP, + NFT_META_PRANDOM, }; /** -- cgit v1.2.3 From 86a7996cc8a078793670d82ed97d5a99bb4e8496 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Feb 2016 14:55:00 -0800 Subject: net_sched: introduce qdisc_replace() helper Remove nearly duplicated code and prepare for the following patch. Cc: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sch_generic.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 636a362a0e03..8fdad9f7a2fb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -707,6 +707,23 @@ static inline void qdisc_reset_queue(struct Qdisc *sch) sch->qstats.backlog = 0; } +static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc **pold) +{ + struct Qdisc *old; + + sch_tree_lock(sch); + old = *pold; + *pold = new; + if (old != NULL) { + qdisc_tree_decrease_qlen(old, old->q.qlen); + qdisc_reset(old); + } + sch_tree_unlock(sch); + + return old; +} + static inline unsigned int __qdisc_queue_drop(struct Qdisc *sch, struct sk_buff_head *list) { -- cgit v1.2.3 From 2ccccf5fb43ff62b2b96cc58d95fc0b3596516e4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Feb 2016 14:55:01 -0800 Subject: net_sched: update hierarchical backlog too When the bottom qdisc decides to, for example, drop some packet, it calls qdisc_tree_decrease_qlen() to update the queue length for all its ancestors, we need to update the backlog too to keep the stats on root qdisc accurate. Cc: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/codel.h | 4 ++++ include/net/sch_generic.h | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/codel.h b/include/net/codel.h index 267e70210061..d168aca115cc 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -162,12 +162,14 @@ struct codel_vars { * struct codel_stats - contains codel shared variables and stats * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() + * @drop_len: bytes of dropped packets in dequeue() * ecn_mark: number of packets we ECN marked instead of dropping * ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; u32 drop_count; + u32 drop_len; u32 ecn_mark; u32 ce_mark; }; @@ -308,6 +310,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, vars->rec_inv_sqrt); goto end; } + stats->drop_len += qdisc_pkt_len(skb); qdisc_drop(skb, sch); stats->drop_count++; skb = dequeue_func(vars, sch); @@ -330,6 +333,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, if (params->ecn && INET_ECN_set_ce(skb)) { stats->ecn_mark++; } else { + stats->drop_len += qdisc_pkt_len(skb); qdisc_drop(skb, sch); stats->drop_count++; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 8fdad9f7a2fb..e5bba897d206 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -396,7 +396,8 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); -void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n); +void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n, + unsigned int len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, @@ -716,7 +717,7 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) { - qdisc_tree_decrease_qlen(old, old->q.qlen); + qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); qdisc_reset(old); } sch_tree_unlock(sch); -- cgit v1.2.3 From 871b642adebe300be2e50aa5f65a418510f636ec Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Feb 2016 10:45:37 +0100 Subject: netdev: introduce ndo_set_rx_headroom This method allows the controlling device (i.e. the bridge) to specify additional headroom to be allocated for skb head on frame reception. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e52077ffe5ed..efe7cec111fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1093,6 +1093,12 @@ struct tc_to_netdev { * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. + * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); + * This function is used to specify the headroom that the skb must + * consider when allocation skb during packet reception. Setting + * appropriate rx headroom value allows avoiding skb head copy on + * forward. Setting a negative value reset the rx headroom to the + * default value. * */ struct net_device_ops { @@ -1278,6 +1284,8 @@ struct net_device_ops { bool proto_down); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); + void (*ndo_set_rx_headroom)(struct net_device *dev, + int needed_headroom); }; /** @@ -1315,6 +1323,8 @@ struct net_device_ops { * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured + * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external + * entity (i.e. the master device for bridged veth) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1343,6 +1353,7 @@ enum netdev_priv_flags { IFF_L3MDEV_SLAVE = 1<<23, IFF_TEAM = 1<<24, IFF_RXFH_CONFIGURED = 1<<25, + IFF_PHONY_HEADROOM = 1<<26, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, void *accel_priv); +/* returns the headroom that the master device needs to take in account + * when forwarding to this dev + */ +static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) +{ + return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom; +} + +static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) +{ + if (dev->netdev_ops->ndo_set_rx_headroom) + dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); +} + +/* set the device rx headroom to the dev's default */ +static inline void netdev_reset_rx_headroom(struct net_device *dev) +{ + netdev_set_rx_headroom(dev, -1); +} + /* * Net namespace inlines */ -- cgit v1.2.3 From 6843e7a2abe7cac10c19702ffec90018df6f040d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 26 Feb 2016 07:53:49 -0800 Subject: net: sched: consolidate offload decision in cls_u32 The offload decision was originally very basic and tied to if the dev implemented the appropriate ndo op hook. The next step is to allow the user to more flexibly define if any paticular rule should be offloaded or not. In order to have this logic in one function lift the current check into a helper routine tc_should_offload(). Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2121df574262..e64d20b81047 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -392,4 +392,9 @@ struct tc_cls_u32_offload { }; }; +static inline bool tc_should_offload(struct net_device *dev) +{ + return dev->netdev_ops->ndo_setup_tc; +} + #endif -- cgit v1.2.3 From 2b6ab0d3aae6bf1e08118060b0c5565778cd6b21 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 26 Feb 2016 07:54:13 -0800 Subject: net: cls_u32: move TC offload feature bit into cls_u32 offload logic In the original series drivers would get offload requests for cls_u32 rules even if the feature bit is disabled. This meant the driver had to do a boiler plate check on the feature bit before adding/deleting the rule. This patch lifts the check into the core code and removes it from the driver specific case. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e64d20b81047..6096e96fb78b 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -394,6 +394,9 @@ struct tc_cls_u32_offload { static inline bool tc_should_offload(struct net_device *dev) { + if (!(dev->features & NETIF_F_HW_TC)) + return false; + return dev->netdev_ops->ndo_setup_tc; } -- cgit v1.2.3 From 9e8ce79cd711d4dfe09d8bba6822cd9bb7db96bd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 26 Feb 2016 07:54:39 -0800 Subject: net: sched: cls_u32 add bit to specify software only rules In the initial implementation the only way to stop a rule from being inserted into the hardware table was via the device feature flag. However this doesn't work well when working on an end host system where packets are expect to hit both the hardware and software datapaths. For example we can imagine a rule that will match an IP address and increment a field. If we install this rule in both hardware and software we may increment the field twice. To date we have only added support for the drop action so we have been able to ignore these cases. But as we extend the action support we will hit this example plus more such cases. Arguably these are not even corner cases in many working systems these cases will be common. To avoid forcing the driver to always abort (i.e. the above example) this patch adds a flag to add a rule in software only. A careful user can use this flag to build software and hardware datapaths that work together. One example we have found particularly useful is to use hardware resources to set the skb->mark on the skb when the match may be expensive to run in software but a mark lookup in a hash table is cheap. The idea here is hardware can do in one lookup what the u32 classifier may need to traverse multiple lists and hash tables to compute. The flag is only passed down on inserts. On deletion to avoid stale references in hardware we always try to remove a rule if it exists. The flags field is part of the classifier specific options. Although it is tempting to lift this into the generic structure doing this proves difficult do to how the tc netlink attributes are implemented along with how the dump/change routines are called. There is also precedence for putting seemingly generic pieces in the specific classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So although not ideal I've left FLAGS in the u32 options as well as it simplifies the code greatly and user space has already learned how to manage these bits ala 'tc' tool. Another thing if trying to update a rule we require the flags to be unchanged. This is to force user space, software u32 and the hardware u32 to keep in sync. Thanks to Simon Horman for catching this case. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 13 +++++++++++-- include/uapi/linux/pkt_cls.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6096e96fb78b..bea14eee373e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -392,12 +392,21 @@ struct tc_cls_u32_offload { }; }; -static inline bool tc_should_offload(struct net_device *dev) +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW 1 + +static inline bool tc_should_offload(struct net_device *dev, u32 flags) { if (!(dev->features & NETIF_F_HW_TC)) return false; - return dev->netdev_ops->ndo_setup_tc; + if (flags & TCA_CLS_FLAGS_SKIP_HW) + return false; + + if (!dev->netdev_ops->ndo_setup_tc) + return false; + + return true; } #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 439873775d49..9874f5680926 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -172,6 +172,7 @@ enum { TCA_U32_INDEV, TCA_U32_PCNT, TCA_U32_MARK, + TCA_U32_FLAGS, __TCA_U32_MAX }; -- cgit v1.2.3 From bfcd3a46617209454cfc0947ab093e37fd1e84ef Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 26 Feb 2016 17:32:23 +0100 Subject: Introduce devlink infrastructure Introduce devlink infrastructure for drivers to register and expose to userspace via generic Netlink interface. There are two basic objects defined: devlink - one instance for every "parent device", for example switch ASIC devlink port - one instance for every physical port of the device. This initial portion implements basic get/dump of objects to userspace. Also, port splitter and port type setting is implemented. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 140 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/devlink.h | 72 ++++++++++++++++++++++ 2 files changed, 212 insertions(+) create mode 100644 include/net/devlink.h create mode 100644 include/uapi/linux/devlink.h (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h new file mode 100644 index 000000000000..c37d257891d6 --- /dev/null +++ b/include/net/devlink.h @@ -0,0 +1,140 @@ +/* + * include/net/devlink.h - Network physical device Netlink interface + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#ifndef _NET_DEVLINK_H_ +#define _NET_DEVLINK_H_ + +#include +#include +#include +#include +#include +#include +#include + +struct devlink_ops; + +struct devlink { + struct list_head list; + struct list_head port_list; + const struct devlink_ops *ops; + struct device *dev; + possible_net_t _net; + char priv[0] __aligned(NETDEV_ALIGN); +}; + +struct devlink_port { + struct list_head list; + struct devlink *devlink; + unsigned index; + bool registered; + enum devlink_port_type type; + enum devlink_port_type desired_type; + void *type_dev; + bool split; + u32 split_group; +}; + +struct devlink_ops { + size_t priv_size; + int (*port_type_set)(struct devlink_port *devlink_port, + enum devlink_port_type port_type); + int (*port_split)(struct devlink *devlink, unsigned int port_index, + unsigned int count); + int (*port_unsplit)(struct devlink *devlink, unsigned int port_index); +}; + +static inline void *devlink_priv(struct devlink *devlink) +{ + BUG_ON(!devlink); + return &devlink->priv; +} + +static inline struct devlink *priv_to_devlink(void *priv) +{ + BUG_ON(!priv); + return container_of(priv, struct devlink, priv); +} + +struct ib_device; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) + +struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); +int devlink_register(struct devlink *devlink, struct device *dev); +void devlink_unregister(struct devlink *devlink); +void devlink_free(struct devlink *devlink); +int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index); +void devlink_port_unregister(struct devlink_port *devlink_port); +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev); +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev); +void devlink_port_type_clear(struct devlink_port *devlink_port); +void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group); + +#else + +static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, + size_t priv_size) +{ + return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL); +} + +static inline int devlink_register(struct devlink *devlink, struct device *dev) +{ + return 0; +} + +static inline void devlink_unregister(struct devlink *devlink) +{ +} + +static inline void devlink_free(struct devlink *devlink) +{ + kfree(devlink); +} + +static inline int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + return 0; +} + +static inline void devlink_port_unregister(struct devlink_port *devlink_port) +{ +} + +static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ +} + +static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ +} + +static inline void devlink_port_type_clear(struct devlink_port *devlink_port) +{ +} + +static inline void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group) +{ +} + +#endif + +#endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h new file mode 100644 index 000000000000..c9fee5781eb1 --- /dev/null +++ b/include/uapi/linux/devlink.h @@ -0,0 +1,72 @@ +/* + * include/uapi/linux/devlink.h - Network physical device Netlink interface + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_DEVLINK_H_ +#define _UAPI_LINUX_DEVLINK_H_ + +#define DEVLINK_GENL_NAME "devlink" +#define DEVLINK_GENL_VERSION 0x1 +#define DEVLINK_GENL_MCGRP_CONFIG_NAME "config" + +enum devlink_command { + /* don't change the order or add anything between, this is ABI! */ + DEVLINK_CMD_UNSPEC, + + DEVLINK_CMD_GET, /* can dump */ + DEVLINK_CMD_SET, + DEVLINK_CMD_NEW, + DEVLINK_CMD_DEL, + + DEVLINK_CMD_PORT_GET, /* can dump */ + DEVLINK_CMD_PORT_SET, + DEVLINK_CMD_PORT_NEW, + DEVLINK_CMD_PORT_DEL, + + DEVLINK_CMD_PORT_SPLIT, + DEVLINK_CMD_PORT_UNSPLIT, + + /* add new commands above here */ + + __DEVLINK_CMD_MAX, + DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 +}; + +enum devlink_port_type { + DEVLINK_PORT_TYPE_NOTSET, + DEVLINK_PORT_TYPE_AUTO, + DEVLINK_PORT_TYPE_ETH, + DEVLINK_PORT_TYPE_IB, +}; + +enum devlink_attr { + /* don't change the order or add anything between, this is ABI! */ + DEVLINK_ATTR_UNSPEC, + + /* bus name + dev name together are a handle for devlink entity */ + DEVLINK_ATTR_BUS_NAME, /* string */ + DEVLINK_ATTR_DEV_NAME, /* string */ + + DEVLINK_ATTR_PORT_INDEX, /* u32 */ + DEVLINK_ATTR_PORT_TYPE, /* u16 */ + DEVLINK_ATTR_PORT_DESIRED_TYPE, /* u16 */ + DEVLINK_ATTR_PORT_NETDEV_IFINDEX, /* u32 */ + DEVLINK_ATTR_PORT_NETDEV_NAME, /* string */ + DEVLINK_ATTR_PORT_IBDEV_NAME, /* string */ + DEVLINK_ATTR_PORT_SPLIT_COUNT, /* u32 */ + DEVLINK_ATTR_PORT_SPLIT_GROUP, /* u32 */ + + /* add new attributes above here, update the policy in devlink.c */ + + __DEVLINK_ATTR_MAX, + DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1 +}; + +#endif /* _UAPI_LINUX_DEVLINK_H_ */ -- cgit v1.2.3 From 09d4d087cd4869859fcc5dfc692f0830550a1b48 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 26 Feb 2016 17:32:24 +0100 Subject: mlx4: Implement devlink interface Implement newly introduced devlink interface. Add devlink port instances for every port and set the port types accordingly. Signed-off-by: Jiri Pirko v2->v3: -add dev param to devlink_register (api change) Signed-off-by: David S. Miller --- include/linux/mlx4/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 2e8af001c5da..bd0e7075ea6d 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -33,6 +33,7 @@ #ifndef MLX4_DRIVER_H #define MLX4_DRIVER_H +#include #include struct mlx4_dev; @@ -89,6 +90,8 @@ int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p); void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); +struct devlink_port *mlx4_get_devlink_port(struct mlx4_dev *dev, int port); + static inline u64 mlx4_mac_to_u64(u8 *addr) { u64 mac = 0; -- cgit v1.2.3 From fb2dabad69f099fb9c03a44276778911da50ba29 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 26 Feb 2016 13:16:00 -0500 Subject: net: dsa: support VLAN filtering switchdev attr When a user explicitly requests VLAN filtering with something like: # echo 1 > /sys/class/net//bridge/vlan_filtering Switchdev propagates a SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING port attribute. Add support for it in the DSA layer with a new port_vlan_filtering function to let drivers toggle 802.1Q filtering on user demand. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3dd54867174a..26c0a3fa009a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -305,6 +305,8 @@ struct dsa_switch_driver { /* * VLAN support */ + int (*port_vlan_filtering)(struct dsa_switch *ds, int port, + bool vlan_filtering); int (*port_vlan_prepare)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans); -- cgit v1.2.3 From 7f0aec7a668419bdbff12de6e8016544f874e708 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 26 Feb 2016 21:20:01 +0100 Subject: bridge: mcast: use names for the different multicast_router types Using raw values makes it difficult to extend and also understand the code, give them names and do explicit per-option manipulation in br_multicast_set_port_router. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0890b217580d..e47f3bc7f323 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -177,6 +177,13 @@ enum { }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) +/* multicast router types */ +enum { + MDB_RTR_TYPE_DISABLED, + MDB_RTR_TYPE_TEMP_QUERY, + MDB_RTR_TYPE_PERM, +}; + enum { MDBA_ROUTER_UNSPEC, MDBA_ROUTER_PORT, -- cgit v1.2.3 From a55d8246abcc910346771175b521ee2bce5a69b3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 26 Feb 2016 21:20:03 +0100 Subject: bridge: mcast: add support for temporary port router Add support for a temporary router port which doesn't depend only on the incoming query. It can be refreshed if set to the same value, which is a no-op for the rest. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e47f3bc7f323..74ee03a47e79 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -182,6 +182,7 @@ enum { MDB_RTR_TYPE_DISABLED, MDB_RTR_TYPE_TEMP_QUERY, MDB_RTR_TYPE_PERM, + MDB_RTR_TYPE_TEMP }; enum { -- cgit v1.2.3 From 59f78f9f6c2e80dcf0f520be85b660f856217b79 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 26 Feb 2016 21:20:04 +0100 Subject: bridge: mcast: add support for more router port information dumping Allow for more multicast router port information to be dumped such as timer and type attributes. For that that purpose we need to extend the MDBA_ROUTER_PORT attribute similar to how it was done for the mdb entries recently. The new format is thus: [MDBA_ROUTER_PORT] = { <- nested attribute u32 ifindex <- router port ifindex for user-space compatibility [MDBA_ROUTER_PATTR attributes] } This way it remains compatible with older users (they'll simply retrieve the u32 in the beginning) and new users can parse the remaining attributes. It would also allow to add future extensions to the router port without breaking compatibility. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 74ee03a47e79..0536eefff9bf 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -144,7 +144,10 @@ struct bridge_vlan_info { * } * } * [MDBA_ROUTER] = { - * [MDBA_ROUTER_PORT] + * [MDBA_ROUTER_PORT] = { + * u32 ifindex + * [MDBA_ROUTER_PATTR attributes] + * } * } */ enum { @@ -192,6 +195,15 @@ enum { }; #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1) +/* router port attributes */ +enum { + MDBA_ROUTER_PATTR_UNSPEC, + MDBA_ROUTER_PATTR_TIMER, + MDBA_ROUTER_PATTR_TYPE, + __MDBA_ROUTER_PATTR_MAX +}; +#define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1) + struct br_port_msg { __u8 family; __u32 ifindex; -- cgit v1.2.3 From ef6980b6becb1afd9d82a4f043749a10ae81bf14 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 27 Feb 2016 08:08:54 -0500 Subject: introduce IFE action This action allows for a sending side to encapsulate arbitrary metadata which is decapsulated by the receiving end. The sender runs in encoding mode and the receiver in decode mode. Both sender and receiver must specify the same ethertype. At some point we hope to have a registered ethertype and we'll then provide a default so the user doesnt have to specify it. For now we enforce the user specify it. Lets show example usage where we encode icmp from a sender towards a receiver with an skbmark of 17; both sender and receiver use ethertype of 0xdead to interop. YYYY: Lets start with Receiver-side policy config: xxx: add an ingress qdisc sudo tc qdisc add dev $ETH ingress xxx: any packets with ethertype 0xdead will be subjected to ife decoding xxx: we then restart the classification so we can match on icmp at prio 3 sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xdead \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify xxx: on restarting the classification from above if it was an icmp xxx: packet, then match it here and continue to the next rule at prio 4 xxx: which will match based on skb mark of 17 sudo tc filter add dev $ETH parent ffff: prio 3 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue xxx: match on skbmark of 0x11 (decimal 17) and accept sudo tc filter add dev $ETH parent ffff: prio 4 protocol ip \ handle 0x11 fw flowid 1:1 \ action ok xxx: Lets show the decoding policy sudo tc -s filter ls dev $ETH parent ffff: protocol 0xdead xxx: filter pref 2 u32 filter pref 2 u32 fh 800: ht divisor 1 filter pref 2 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 (rule hit 0 success 0) match 00000000/00000000 at 0 (success 0 ) action order 1: ife decode action reclassify index 1 ref 1 bind 1 installed 14 sec used 14 sec type: 0x0 Metadata: allow mark allow hash allow prio allow qmap Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 xxx: Observe that above lists all metadatum it can decode. Typically these submodules will already be compiled into a monolithic kernel or loaded as modules YYYY: Lets show the sender side now .. xxx: Add an egress qdisc on the sender netdev sudo tc qdisc add dev $ETH root handle 1: prio xxx: xxx: Match all icmp packets to 192.168.122.237/24, then xxx: tag the packet with skb mark of decimal 17, then xxx: Encode it with: xxx: ethertype 0xdead xxx: add skb->mark to whitelist of metadatum to send xxx: rewrite target dst MAC address to 02:15:15:15:15:15 xxx: sudo $TC filter add dev $ETH parent 1: protocol ip prio 10 u32 \ match ip dst 192.168.122.237/24 \ match ip protocol 1 0xff \ flowid 1:2 \ action skbedit mark 17 \ action ife encode \ type 0xDEAD \ allow mark \ dst 02:15:15:15:15:15 xxx: Lets show the encoding policy sudo tc -s filter ls dev $ETH parent 1: protocol ip xxx: filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:2 (rule hit 0 success 0) match c0a87aed/ffffffff at 16 (success 0 ) match 00010000/00ff0000 at 8 (success 0 ) action order 1: skbedit mark 17 index 6 ref 1 bind 1 Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 action order 2: ife encode action pipe index 3 ref 1 bind 1 dst MAC: 02:15:15:15:15:15 type: 0xDEAD Metadata: allow mark Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 xxx: test by sending ping from sender to destination Signed-off-by: Jamal Hadi Salim Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 61 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/tc_act/tc_ife.h | 38 ++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 include/net/tc_act/tc_ife.h create mode 100644 include/uapi/linux/tc_act/tc_ife.h (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h new file mode 100644 index 000000000000..dc9a09aefb33 --- /dev/null +++ b/include/net/tc_act/tc_ife.h @@ -0,0 +1,61 @@ +#ifndef __NET_TC_IFE_H +#define __NET_TC_IFE_H + +#include +#include +#include +#include + +#define IFE_METAHDRLEN 2 +struct tcf_ife_info { + struct tcf_common common; + u8 eth_dst[ETH_ALEN]; + u8 eth_src[ETH_ALEN]; + u16 eth_type; + u16 flags; + /* list of metaids allowed */ + struct list_head metalist; +}; +#define to_ife(a) \ + container_of(a->priv, struct tcf_ife_info, common) + +struct tcf_meta_info { + const struct tcf_meta_ops *ops; + void *metaval; + u16 metaid; + struct list_head metalist; +}; + +struct tcf_meta_ops { + u16 metaid; /*Maintainer provided ID */ + u16 metatype; /*netlink attribute type (look at net/netlink.h) */ + const char *name; + const char *synopsis; + struct list_head list; + int (*check_presence)(struct sk_buff *, struct tcf_meta_info *); + int (*encode)(struct sk_buff *, void *, struct tcf_meta_info *); + int (*decode)(struct sk_buff *, void *, u16 len); + int (*get)(struct sk_buff *skb, struct tcf_meta_info *mi); + int (*alloc)(struct tcf_meta_info *, void *); + void (*release)(struct tcf_meta_info *); + int (*validate)(void *val, int len); + struct module *owner; +}; + +#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ifemeta" __stringify_1(metan)) + +int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi); +int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi); +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, + const void *dval); +int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval); +int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval); +int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi); +int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi); +int ife_validate_meta_u32(void *val, int len); +int ife_validate_meta_u16(void *val, int len); +void ife_release_meta_gen(struct tcf_meta_info *mi); +int register_ife_op(struct tcf_meta_ops *mops); +int unregister_ife_op(struct tcf_meta_ops *mops); + +#endif /* __NET_TC_IFE_H */ diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h new file mode 100644 index 000000000000..d648ff66586f --- /dev/null +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -0,0 +1,38 @@ +#ifndef __UAPI_TC_IFE_H +#define __UAPI_TC_IFE_H + +#include +#include + +#define TCA_ACT_IFE 25 +/* Flag bits for now just encoding/decoding; mutually exclusive */ +#define IFE_ENCODE 1 +#define IFE_DECODE 0 + +struct tc_ife { + tc_gen; + __u16 flags; +}; + +/*XXX: We need to encode the total number of bytes consumed */ +enum { + TCA_IFE_UNSPEC, + TCA_IFE_PARMS, + TCA_IFE_TM, + TCA_IFE_DMAC, + TCA_IFE_SMAC, + TCA_IFE_TYPE, + TCA_IFE_METALST, + __TCA_IFE_MAX +}; +#define TCA_IFE_MAX (__TCA_IFE_MAX - 1) + +#define IFE_META_SKBMARK 1 +#define IFE_META_HASHID 2 +#define IFE_META_PRIO 3 +#define IFE_META_QMAP 4 +/*Can be overridden at runtime by module option*/ +#define __IFE_META_MAX 5 +#define IFE_META_MAX (__IFE_META_MAX - 1) + +#endif -- cgit v1.2.3 From 822c868532cae2cc1c51f4f18ab61c194d98aaf6 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 27 Feb 2016 00:32:15 -0800 Subject: net: ipv4: Convert IP network timestamps to be y2038 safe ICMP timestamp messages and IP source route options require timestamps to be in milliseconds modulo 24 hours from midnight UT format. Add inet_current_timestamp() function to support this. The function returns the required timestamp in network byte order. Timestamp calculation is also changed to call ktime_get_real_ts64() which uses struct timespec64. struct timespec64 is y2038 safe. Previously it called getnstimeofday() which uses struct timespec. struct timespec is not y2038 safe. Signed-off-by: Deepa Dinamani Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: James Morris Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/net/ip.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index cbb134b2f0e4..fad74d323bd6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -240,6 +240,8 @@ static inline int inet_is_local_reserved_port(struct net *net, int port) } #endif +__be32 inet_current_timestamp(void); + /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; -- cgit v1.2.3 From 6b6c07bdcdc97ccac2596063bfc32a5faddfe884 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 2 Mar 2016 00:13:39 +0200 Subject: net/mlx5: Make command timeout way shorter The command timeout is terribly long, whole two hours. Make it 60s so if things do go wrong, the user gets feedback in relatively short time, so they can take corrective actions and/or investigate using tools and such. Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Or Gerlitz Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a815da92d4eb..3388a43b78f6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -54,7 +54,7 @@ enum { /* one minute for the sake of bringup. Generally, commands must always * complete and we may need to increase this timeout value */ - MLX5_CMD_TIMEOUT_MSEC = 7200 * 1000, + MLX5_CMD_TIMEOUT_MSEC = 60 * 1000, MLX5_CMD_WQ_MAX_NAME = 32, }; -- cgit v1.2.3 From 0ba422410bbf7081c3c7d7b2dcc10e9eb5cb46f7 Mon Sep 17 00:00:00 2001 From: Moshe Lazer Date: Wed, 2 Mar 2016 00:13:40 +0200 Subject: net/mlx5: Fix global UAR mapping Avoid double mapping of io mapped memory, Device page may be mapped to non-cached(NC) or to write-combining(WC). The code before this fix tries to map it both to WC and NC contrary to what stated in Intel's software developer manual. Here we remove the global WC mapping of all UARS "dev->priv.bf_mapping", since UAR mapping should be decided per UAR (e.g we want different mappings for EQs, CQs vs QPs). Caller will now have to choose whether to map via write-combining API or not. mlx5e SQs will choose write-combining in order to perform BlueFlame writes. Fixes: 88a85f99e51f ('TX latency optimization to save DMA reads') Signed-off-by: Moshe Lazer Reviewed-by: Achiad Shochat Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3388a43b78f6..bb1a880a5bc5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -460,8 +460,6 @@ struct mlx5_priv { struct mlx5_uuar_info uuari; MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock); - struct io_mapping *bf_mapping; - /* pages stuff */ struct workqueue_struct *pg_wq; struct rb_root page_root; @@ -719,7 +717,8 @@ int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); -int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar); +int mlx5_alloc_map_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar, + bool map_wc); void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 64d4e3431e686dc37ce388ba531c4c4e866fb141 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Sat, 27 Feb 2016 20:19:54 -0800 Subject: net: remove skb_sender_cpu_clear() After commit 52bd2d62ce67 ("net: better skb->sender_cpu and skb->napi_id cohabitation") skb_sender_cpu_clear() becomes empty and can be removed. Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eab4f8fbed58..797cefb888fb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1161,10 +1161,6 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; -static inline void skb_sender_cpu_clear(struct sk_buff *skb) -{ -} - #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { -- cgit v1.2.3 From 4ac801b77e6f06e6b12c069fd29216a4102065fb Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Sun, 28 Feb 2016 12:26:52 +0200 Subject: qed: Semantic refactoring of interrupt code Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 3d43c1d4ecef..1f7599c77cd4 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -446,6 +446,12 @@ struct qed_eth_stats { #define RX_PI 0 #define TX_PI(tc) (RX_PI + 1 + tc) +struct qed_sb_cnt_info { + int sb_cnt; + int sb_iov_cnt; + int sb_free_blk; +}; + static inline u16 qed_sb_update_sb_idx(struct qed_sb_info *sb_info) { u32 prod = 0; -- cgit v1.2.3 From a67dd266adf42a24df31380e9da78390bb4d65ef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Feb 2016 10:08:35 +0100 Subject: netfilter: xtables: prepare for on-demand hook register This change prepares for upcoming on-demand xtables hook registration. We change the protoypes of the register/unregister functions. A followup patch will then add nf_hook_register/unregister calls to the iptables one. Once a hook is registered packets will be picked up, so all assignments of the form net->ipv4.iptable_$table = new_table have to be moved to ip(6)t_register_table, else we can see NULL net->ipv4.iptable_$table later. This patch doesn't change functionality; without this the actual change simply gets too big. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 9 +++++---- include/linux/netfilter_ipv4/ip_tables.h | 9 +++++---- include/linux/netfilter_ipv6/ip6_tables.h | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 6f074db2f23d..029b95e8924e 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -48,10 +48,11 @@ struct arpt_error { } extern void *arpt_alloc_initial_table(const struct xt_table *); -extern struct xt_table *arpt_register_table(struct net *net, - const struct xt_table *table, - const struct arpt_replace *repl); -extern void arpt_unregister_table(struct xt_table *table); +int arpt_register_table(struct net *net, const struct xt_table *table, + const struct arpt_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res); +void arpt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index aa598f942c01..7bfc5893ec31 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -24,10 +24,11 @@ extern void ipt_init(void) __init; -extern struct xt_table *ipt_register_table(struct net *net, - const struct xt_table *table, - const struct ipt_replace *repl); -extern void ipt_unregister_table(struct net *net, struct xt_table *table); +int ipt_register_table(struct net *net, const struct xt_table *table, + const struct ipt_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res); +void ipt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); /* Standard entry. */ struct ipt_standard { diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 0f76e5c674f9..b21c392d6012 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -25,10 +25,11 @@ extern void ip6t_init(void) __init; extern void *ip6t_alloc_initial_table(const struct xt_table *); -extern struct xt_table *ip6t_register_table(struct net *net, - const struct xt_table *table, - const struct ip6t_replace *repl); -extern void ip6t_unregister_table(struct net *net, struct xt_table *table); +int ip6t_register_table(struct net *net, const struct xt_table *table, + const struct ip6t_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res); +void ip6t_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops); extern unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -- cgit v1.2.3 From b9e69e127397187b70c813a4397cce7afb5e8cb1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Feb 2016 10:08:36 +0100 Subject: netfilter: xtables: don't hook tables by default delay hook registration until the table is being requested inside a namespace. Historically, a particular table (iptables mangle, ip6tables filter, etc) was registered on module load. When netns support was added to iptables only the ip/ip6tables ruleset was made namespace aware, not the actual hook points. This means f.e. that when ipt_filter table/module is loaded on a system, then each namespace on that system has an (empty) iptables filter ruleset. In other words, if a namespace sends a packet, such skb is 'caught' by netfilter machinery and fed to hooking points for that table (i.e. INPUT, FORWARD, etc). Thanks to Eric Biederman, hooks are no longer global, but per namespace. This means that we can avoid allocation of empty ruleset in a namespace and defer hook registration until we need the functionality. We register a tables hook entry points ONLY in the initial namespace. When an iptables get/setockopt is issued inside a given namespace, we check if the table is found in the per-namespace list. If not, we attempt to find it in the initial namespace, and, if found, create an empty default table in the requesting namespace and register the needed hooks. Hook points are destroyed only once namespace is deleted, there is no 'usage count' (it makes no sense since there is no 'remove table' operation in xtables api). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index c5577410c25d..80a305b85323 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -200,6 +200,9 @@ struct xt_table { u_int8_t af; /* address/protocol family */ int priority; /* hook order */ + /* called when table is needed in the given netns */ + int (*table_init)(struct net *net); + /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; }; @@ -408,8 +411,7 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) return cnt; } -struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *); -void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *); +struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); #ifdef CONFIG_COMPAT #include -- cgit v1.2.3 From af4610c39589d839551da104f7da342d86f23ea0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Feb 2016 10:08:38 +0100 Subject: netfilter: don't call hooks unless needed With the previous patches in place, a netns nf_hook_list might be empty, even if e.g. init_net performs filtering. Thus change nf_hook_thresh to check the hook_list as well before initializing hook_state and calling nf_hook_slow(). We still make use of static keys; if no netfilter modules are loaded list is guaranteed to be empty. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 0ad556726181..9230f9aee896 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -141,22 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; - -static inline bool nf_hook_list_active(struct list_head *hook_list, - u_int8_t pf, unsigned int hook) -{ - if (__builtin_constant_p(pf) && - __builtin_constant_p(hook)) - return static_key_false(&nf_hooks_needed[pf][hook]); - - return !list_empty(hook_list); -} -#else -static inline bool nf_hook_list_active(struct list_head *hook_list, - u_int8_t pf, unsigned int hook) -{ - return !list_empty(hook_list); -} #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state); @@ -177,9 +161,18 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct net *, struct sock *, struct sk_buff *), int thresh) { - struct list_head *hook_list = &net->nf.hooks[pf][hook]; + struct list_head *hook_list; + +#ifdef HAVE_JUMP_LABEL + if (__builtin_constant_p(pf) && + __builtin_constant_p(hook) && + !static_key_false(&nf_hooks_needed[pf][hook])) + return 1; +#endif + + hook_list = &net->nf.hooks[pf][hook]; - if (nf_hook_list_active(hook_list, pf, hook)) { + if (!list_empty(hook_list)) { struct nf_hook_state state; nf_hook_state_init(&state, hook_list, hook, thresh, -- cgit v1.2.3 From 8a6bf5da1aefdafd60b73d9122c7af9fd2d7bb9c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 1 Mar 2016 19:55:14 +0100 Subject: netfilter: nft_masq: support port range Complete masquerading support by allowing port range selection. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_masq.h | 4 +++- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h index e2a518b60e19..a3f3c11b2526 100644 --- a/include/net/netfilter/nft_masq.h +++ b/include/net/netfilter/nft_masq.h @@ -2,7 +2,9 @@ #define _NFT_MASQ_H_ struct nft_masq { - u32 flags; + u32 flags; + enum nft_registers sreg_proto_min:8; + enum nft_registers sreg_proto_max:8; }; extern const struct nla_policy nft_masq_policy[]; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b19be0a098c0..eeffde196f80 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -951,10 +951,14 @@ enum nft_nat_attributes { * enum nft_masq_attributes - nf_tables masquerade expression attributes * * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + * @NFTA_MASQ_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_MASQ_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) */ enum nft_masq_attributes { NFTA_MASQ_UNSPEC, NFTA_MASQ_FLAGS, + NFTA_MASQ_REG_PROTO_MIN, + NFTA_MASQ_REG_PROTO_MAX, __NFTA_MASQ_MAX }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) -- cgit v1.2.3 From afea03656add70a0e00f5b0039f87288c7af8b9f Mon Sep 17 00:00:00 2001 From: Giuseppe Cavallaro Date: Mon, 29 Feb 2016 14:27:28 +0100 Subject: stmmac: rework DMA bus setting and introduce new platform AXI structure This patch restructures the DMA bus settings and this is done by introducing a new platform structure used for programming the AXI Bus Mode Register inside the DMA module. This structure can be populated from device-tree as documented in the binding txt file. After initializing the DMA, the AXI register can be optionally tuned for platform drivers based. This patch also reworks some parameters to make coherent the DMA configuration now that AXI register is introduced. For example, the burst_len is managed by using the mentioned axi support above; so the snps,burst-len parameter has been removed. It makes sense to provide the AAL parameter from DT to Address-Aligned Beats inside the Register0 and review the PBL settings when initialize the engine. For PCI glue, rebuilding the story of this setting, it was added to align a configuration so not for fixing some known problem. No issue raised after this patch. It is safe to use the default burst length instead of tuning it to the maximum value Signed-off-by: Giuseppe Cavallaro Signed-off-by: Alexandre TORGUE Signed-off-by: David S. Miller --- include/linux/stmmac.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index eead8ab93c0a..6e53fa8942a4 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -90,7 +90,21 @@ struct stmmac_dma_cfg { int pbl; int fixed_burst; int mixed_burst; - int burst_len; + bool aal; +}; + +#define AXI_BLEN 7 +struct stmmac_axi { + bool axi_lpi_en; + bool axi_xit_frm; + u32 axi_wr_osr_lmt; + u32 axi_rd_osr_lmt; + bool axi_kbbe; + bool axi_axi_all; + u32 axi_blen[AXI_BLEN]; + bool axi_fb; + bool axi_mb; + bool axi_rb; }; struct plat_stmmacenet_data { @@ -122,5 +136,6 @@ struct plat_stmmacenet_data { int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); void *bsp_priv; + struct stmmac_axi *axi; }; #endif -- cgit v1.2.3 From 1f27cde313d72d6b44a73ba89c8b2c6a99c628cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2016 08:21:43 -0800 Subject: net: sched: use pfifo_fast for non real queues Some devices declare a high number of TX queues, then set a much lower real_num_tx_queues This cause setups using fq_codel, sfq or fq as the default qdisc to consume more memory than really needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e5bba897d206..46e55f0202a6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -345,6 +345,12 @@ extern struct Qdisc_ops pfifo_fast_ops; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; +static inline const struct Qdisc_ops * +get_default_qdisc_ops(const struct net_device *dev, int ntx) +{ + return ntx < dev->real_num_tx_queues ? + default_qdisc_ops : &pfifo_fast_ops; +} struct Qdisc_class_common { u32 classid; -- cgit v1.2.3 From 0d12f8a4027d021c9cc942f09f38d28288020c5d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Mar 2016 15:53:46 +0000 Subject: rxrpc: Keep the skb private record of the Rx header in host byte order Currently, a copy of the Rx packet header is copied into the the sk_buff private data so that we can advance the pointer into the buffer, potentially discarding the original. At the moment, this copy is held in network byte order, but this means we're doing a lot of unnecessary translations. The reasons it was done this way are that we need the values in network byte order occasionally and we can use the copy, slightly modified, as part of an iov array when sending an ack or an abort packet. However, it seems more reasonable on review that it would be better kept in host byte order and that we make up a new header when we want to send another packet. To this end, rename the original header struct to rxrpc_wire_header (with BE fields) and institute a variant called rxrpc_host_header that has host order fields. Change the struct in the sk_buff private data into an rxrpc_host_header and translate the values when filling it in. This further allows us to keep values kept in various structures in host byte order rather than network byte order and allows removal of some fields that are byteswapped duplicates. Signed-off-by: David Howells --- include/rxrpc/packet.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index 4dce116bfd80..de1e67988ada 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -22,7 +22,7 @@ typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ * on-the-wire Rx packet header * - all multibyte fields should be in network byte order */ -struct rxrpc_header { +struct rxrpc_wire_header { __be32 epoch; /* client boot timestamp */ __be32 cid; /* connection and channel ID */ @@ -68,8 +68,6 @@ struct rxrpc_header { } __packed; -#define __rxrpc_header_off(X) offsetof(struct rxrpc_header,X) - extern const char *rxrpc_pkts[]; /*****************************************************************************/ -- cgit v1.2.3 From 351c1e648623b742fe1687636117306adc8b561c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Mar 2016 15:56:06 +0000 Subject: rxrpc: Be more selective about the types of received packets we accept Currently, received RxRPC packets outside the range 1-13 are rejected. There are, however, holes in the range that should also be rejected - plus at least one type we don't yet support - so reject these also. Signed-off-by: David Howells --- include/rxrpc/packet.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index de1e67988ada..9ebab3a8cf0a 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -70,6 +70,17 @@ struct rxrpc_wire_header { extern const char *rxrpc_pkts[]; +#define RXRPC_SUPPORTED_PACKET_TYPES ( \ + (1 << RXRPC_PACKET_TYPE_DATA) | \ + (1 << RXRPC_PACKET_TYPE_ACK) | \ + (1 << RXRPC_PACKET_TYPE_BUSY) | \ + (1 << RXRPC_PACKET_TYPE_ABORT) | \ + (1 << RXRPC_PACKET_TYPE_ACKALL) | \ + (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ + (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ + /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ + (1 << RXRPC_PACKET_TYPE_VERSION)) + /*****************************************************************************/ /* * jumbo packet secondary header -- cgit v1.2.3 From b5d3755a22e0cc4c369c0985aef0c52c2477c1e7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 4 Mar 2016 11:52:16 +0100 Subject: uapi: define DIV_ROUND_UP for userland DIV_ROUND_UP is defined in linux/kernel.h only for the kernel. When ethtool.h is included by a userland app, we got the following error: include/linux/ethtool.h:1218:8: error: variably modified 'queue_mask' at file scope __u32 queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; ^ Let's add a common definition in uapi and use it everywhere. Fixes: ac2c7ad0e5d6 ("net/ethtool: introduce a new ioctl for per queue setting") CC: Kan Liang Suggested-by: Ben Hutchings Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/kernel.h | 2 +- include/uapi/linux/ethtool.h | 3 ++- include/uapi/linux/kernel.h | 1 + include/uapi/linux/mroute6.h | 9 ++------- 4 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f31638c6e873..ac1923957236 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -64,7 +64,7 @@ #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) -#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP #define DIV_ROUND_UP_ULL(ll,d) \ ({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; }) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 37fd6dc33de4..9c22249ebf35 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -13,6 +13,7 @@ #ifndef _UAPI_LINUX_ETHTOOL_H #define _UAPI_LINUX_ETHTOOL_H +#include #include #include @@ -1215,7 +1216,7 @@ enum ethtool_sfeatures_retval_bits { struct ethtool_per_queue_op { __u32 cmd; __u32 sub_command; - __u32 queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; + __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; char data[]; }; diff --git a/include/uapi/linux/kernel.h b/include/uapi/linux/kernel.h index 321e399457f5..466073f0ce46 100644 --- a/include/uapi/linux/kernel.h +++ b/include/uapi/linux/kernel.h @@ -9,5 +9,6 @@ #define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #endif /* _UAPI_LINUX_KERNEL_H */ diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index ce91215cf7e6..5062fb5751e1 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -1,6 +1,7 @@ #ifndef _UAPI__LINUX_MROUTE6_H #define _UAPI__LINUX_MROUTE6_H +#include #include #include @@ -46,14 +47,8 @@ typedef unsigned short mifi_t; typedef __u32 if_mask; #define NIFBITS (sizeof(if_mask) * 8) /* bits per mask */ -#if !defined(__KERNEL__) -#if !defined(DIV_ROUND_UP) -#define DIV_ROUND_UP(x,y) (((x) + ((y) - 1)) / (y)) -#endif -#endif - typedef struct if_set { - if_mask ifs_bits[DIV_ROUND_UP(IF_SETSIZE, NIFBITS)]; + if_mask ifs_bits[__KERNEL_DIV_ROUND_UP(IF_SETSIZE, NIFBITS)]; } if_set; #define IF_SET(n, p) ((p)->ifs_bits[(n)/NIFBITS] |= (1 << ((n) % NIFBITS))) -- cgit v1.2.3 From 14e2037902d65213842b4e40305ff54a64abbcb6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 4 Mar 2016 11:52:19 +0100 Subject: ethtool.h: define INT_MAX for userland INT_MAX needs limits.h in userland. When ethtool.h is included by a userland app, we got the following error: .../usr/include/linux/ethtool.h: In function 'ethtool_validate_speed': .../usr/include/linux/ethtool.h:1471:18: error: 'INT_MAX' undeclared (first use in this function) return speed <= INT_MAX || speed == SPEED_UNKNOWN ^ Fixes: e02564ee334a ("ethtool: make validate_speed accept all speeds between 0 and INT_MAX") CC: Nikolay Aleksandrov Signed-off-by: Nicolas Dichtel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9c22249ebf35..2835b07416b7 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -17,6 +17,10 @@ #include #include +#ifndef __KERNEL__ +#include /* for INT_MAX */ +#endif + /* All structures exposed to userland should be defined such that they * have the same layout for 32-bit and 64-bit userland. */ -- cgit v1.2.3 From f719e3754ee2f7275437e61a6afd520181fdd43b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 5 Mar 2016 15:03:22 +0200 Subject: ipvs: drop first packet to redirect conntrack Jiri Bohac is reporting for a problem where the attempt to reschedule existing connection to another real server needs proper redirect for the conntrack used by the IPVS connection. For example, when IPVS connection is created to NAT-ed real server we alter the reply direction of conntrack. If we later decide to select different real server we can not alter again the conntrack. And if we expire the old connection, the new connection is left without conntrack. So, the only way to redirect both the IPVS connection and the Netfilter's conntrack is to drop the SYN packet that hits existing connection, to wait for the next jiffie to expire the old connection and its conntrack and to rely on client's retransmission to create new connection as usually. Jiri Bohac provided a fix that drops all SYNs on rescheduling, I extended his patch to do such drops only for connections that use conntrack. Here is the original report from Jiri Bohac: Since commit dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead"), new connections to dead servers are redistributed immediately to new servers. The old connection is expired using ip_vs_conn_expire_now() which sets the connection timer to expire immediately. However, before the timer callback, ip_vs_conn_expire(), is run to clean the connection's conntrack entry, the new redistributed connection may already be established and its conntrack removed instead. Fix this by dropping the first packet of the new connection instead, like we do when the destination server is not available. The timer will have deleted the old conntrack entry long before the first packet of the new connection is retransmitted. Fixes: dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead") Signed-off-by: Jiri Bohac Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0816c872b689..a6cc576fd467 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) } #endif /* CONFIG_IP_VS_NFCT */ +/* Really using conntrack? */ +static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, + struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_NFCT + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + return false; + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct)) + return true; +#endif + return false; +} + static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { -- cgit v1.2.3 From 4d7928959832ea41f7f91456b76da19cad01bd09 Mon Sep 17 00:00:00 2001 From: Hante Meuleman Date: Wed, 17 Feb 2016 11:27:07 +0100 Subject: brcmfmac: switch to new platform data Platform data is only available for sdio. With this patch a new platform data structure is being used which allows for platform data for any device and configurable per device. This patch only switches to the new structure and adds support for SDIO devices. Reviewed-by: Arend Van Spriel Reviewed-by: Franky (Zhenhui) Lin Reviewed-by: Pieter-Paul Giesberts Signed-off-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- include/linux/platform_data/brcmfmac-sdio.h | 135 -------------------- include/linux/platform_data/brcmfmac.h | 185 ++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 135 deletions(-) delete mode 100644 include/linux/platform_data/brcmfmac-sdio.h create mode 100644 include/linux/platform_data/brcmfmac.h (limited to 'include') diff --git a/include/linux/platform_data/brcmfmac-sdio.h b/include/linux/platform_data/brcmfmac-sdio.h deleted file mode 100644 index e75dcbf2b230..000000000000 --- a/include/linux/platform_data/brcmfmac-sdio.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2013 Broadcom Corporation - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef _LINUX_BRCMFMAC_PLATFORM_H -#define _LINUX_BRCMFMAC_PLATFORM_H - -/* - * Platform specific driver functions and data. Through the platform specific - * device data functions can be provided to help the brcmfmac driver to - * operate with the device in combination with the used platform. - * - * Use the platform data in the following (similar) way: - * - * -#include - - -static void brcmfmac_power_on(void) -{ -} - -static void brcmfmac_power_off(void) -{ -} - -static void brcmfmac_reset(void) -{ -} - -static struct brcmfmac_sdio_platform_data brcmfmac_sdio_pdata = { - .power_on = brcmfmac_power_on, - .power_off = brcmfmac_power_off, - .reset = brcmfmac_reset -}; - -static struct platform_device brcmfmac_device = { - .name = BRCMFMAC_SDIO_PDATA_NAME, - .id = PLATFORM_DEVID_NONE, - .dev.platform_data = &brcmfmac_sdio_pdata -}; - -void __init brcmfmac_init_pdata(void) -{ - brcmfmac_sdio_pdata.oob_irq_supported = true; - brcmfmac_sdio_pdata.oob_irq_nr = gpio_to_irq(GPIO_BRCMF_SDIO_OOB); - brcmfmac_sdio_pdata.oob_irq_flags = IORESOURCE_IRQ | - IORESOURCE_IRQ_HIGHLEVEL; - platform_device_register(&brcmfmac_device); -} - * - * - * Note: the brcmfmac can be loaded as module or be statically built-in into - * the kernel. If built-in then do note that it uses module_init (and - * module_exit) routines which equal device_initcall. So if you intend to - * create a module with the platform specific data for the brcmfmac and have - * it built-in to the kernel then use a higher initcall then device_initcall - * (see init.h). If this is not done then brcmfmac will load without problems - * but will not pickup the platform data. - * - * When the driver does not "detect" platform driver data then it will continue - * without reporting anything and just assume there is no data needed. Which is - * probably true for most platforms. - * - * Explanation of the platform_data fields: - * - * drive_strength: is the preferred drive_strength to be used for the SDIO - * pins. If 0 then a default value will be used. This is the target drive - * strength, the exact drive strength which will be used depends on the - * capabilities of the device. - * - * oob_irq_supported: does the board have support for OOB interrupts. SDIO - * in-band interrupts are relatively slow and for having less overhead on - * interrupt processing an out of band interrupt can be used. If the HW - * supports this then enable this by setting this field to true and configure - * the oob related fields. - * - * oob_irq_nr, oob_irq_flags: the OOB interrupt information. The values are - * used for registering the irq using request_irq function. - * - * broken_sg_support: flag for broken sg list support of SDIO host controller. - * Set this to true if the SDIO host controller has higher align requirement - * than 32 bytes for each scatterlist item. - * - * sd_head_align: alignment requirement for start of data buffer - * - * sd_sgentry_align: length alignment requirement for each sg entry - * - * power_on: This function is called by the brcmfmac when the module gets - * loaded. This can be particularly useful for low power devices. The platform - * spcific routine may for example decide to power up the complete device. - * If there is no use-case for this function then provide NULL. - * - * power_off: This function is called by the brcmfmac when the module gets - * unloaded. At this point the device can be powered down or otherwise be reset. - * So if an actual power_off is not supported but reset is then reset the device - * when this function gets called. This can be particularly useful for low power - * devices. If there is no use-case for this function (either power-down or - * reset) then provide NULL. - * - * reset: This function can get called if the device communication broke down. - * This functionality is particularly useful in case of SDIO type devices. It is - * possible to reset a dongle via sdio data interface, but it requires that - * this is fully functional. This function is chip/module specific and this - * function should return only after the complete reset has completed. - */ - -#define BRCMFMAC_SDIO_PDATA_NAME "brcmfmac_sdio" - -struct brcmfmac_sdio_platform_data { - unsigned int drive_strength; - bool oob_irq_supported; - unsigned int oob_irq_nr; - unsigned long oob_irq_flags; - bool broken_sg_support; - unsigned short sd_head_align; - unsigned short sd_sgentry_align; - void (*power_on)(void); - void (*power_off)(void); - void (*reset)(void); -}; - -#endif /* _LINUX_BRCMFMAC_PLATFORM_H */ diff --git a/include/linux/platform_data/brcmfmac.h b/include/linux/platform_data/brcmfmac.h new file mode 100644 index 000000000000..1d30bf278231 --- /dev/null +++ b/include/linux/platform_data/brcmfmac.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 201 Broadcom Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _LINUX_BRCMFMAC_PLATFORM_H +#define _LINUX_BRCMFMAC_PLATFORM_H + + +#define BRCMFMAC_PDATA_NAME "brcmfmac" + +#define BRCMFMAC_COUNTRY_BUF_SZ 4 + + +/* + * Platform specific driver functions and data. Through the platform specific + * device data functions and data can be provided to help the brcmfmac driver to + * operate with the device in combination with the used platform. + */ + + +/** + * Note: the brcmfmac can be loaded as module or be statically built-in into + * the kernel. If built-in then do note that it uses module_init (and + * module_exit) routines which equal device_initcall. So if you intend to + * create a module with the platform specific data for the brcmfmac and have + * it built-in to the kernel then use a higher initcall then device_initcall + * (see init.h). If this is not done then brcmfmac will load without problems + * but will not pickup the platform data. + * + * When the driver does not "detect" platform driver data then it will continue + * without reporting anything and just assume there is no data needed. Which is + * probably true for most platforms. + */ + +/** + * enum brcmf_bus_type - Bus type identifier. Currently SDIO, USB and PCIE are + * supported. + */ +enum brcmf_bus_type { + BRCMF_BUSTYPE_SDIO, + BRCMF_BUSTYPE_USB, + BRCMF_BUSTYPE_PCIE +}; + + +/** + * struct brcmfmac_sdio_pd - SDIO Device specific platform data. + * + * @txglomsz: SDIO txglom size. Use 0 if default of driver is to be + * used. + * @drive_strength: is the preferred drive_strength to be used for the SDIO + * pins. If 0 then a default value will be used. This is + * the target drive strength, the exact drive strength + * which will be used depends on the capabilities of the + * device. + * @oob_irq_supported: does the board have support for OOB interrupts. SDIO + * in-band interrupts are relatively slow and for having + * less overhead on interrupt processing an out of band + * interrupt can be used. If the HW supports this then + * enable this by setting this field to true and configure + * the oob related fields. + * @oob_irq_nr, + * @oob_irq_flags: the OOB interrupt information. The values are used for + * registering the irq using request_irq function. + * @broken_sg_support: flag for broken sg list support of SDIO host controller. + * Set this to true if the SDIO host controller has higher + * align requirement than 32 bytes for each scatterlist + * item. + * @sd_head_align: alignment requirement for start of data buffer. + * @sd_sgentry_align: length alignment requirement for each sg entry. + * @reset: This function can get called if the device communication + * broke down. This functionality is particularly useful in + * case of SDIO type devices. It is possible to reset a + * dongle via sdio data interface, but it requires that + * this is fully functional. This function is chip/module + * specific and this function should return only after the + * complete reset has completed. + */ +struct brcmfmac_sdio_pd { + int txglomsz; + unsigned int drive_strength; + bool oob_irq_supported; + unsigned int oob_irq_nr; + unsigned long oob_irq_flags; + bool broken_sg_support; + unsigned short sd_head_align; + unsigned short sd_sgentry_align; + void (*reset)(void); +}; + +/** + * struct brcmfmac_pd_cc_entry - Struct for translating user space country code + * (iso3166) to firmware country code and + * revision. + * + * @iso3166: iso3166 alpha 2 country code string. + * @cc: firmware country code string. + * @rev: firmware country code revision. + */ +struct brcmfmac_pd_cc_entry { + char iso3166[BRCMFMAC_COUNTRY_BUF_SZ]; + char cc[BRCMFMAC_COUNTRY_BUF_SZ]; + s32 rev; +}; + +/** + * struct brcmfmac_pd_cc - Struct for translating country codes as set by user + * space to a country code and rev which can be used by + * firmware. + * + * @table_size: number of entries in table (> 0) + * @table: array of 1 or more elements with translation information. + */ +struct brcmfmac_pd_cc { + int table_size; + struct brcmfmac_pd_cc_entry table[0]; +}; + +/** + * struct brcmfmac_pd_device - Device specific platform data. (id/rev/bus_type) + * is the unique identifier of the device. + * + * @id: ID of the device for which this data is. In case of SDIO + * or PCIE this is the chipid as identified by chip.c In + * case of USB this is the chipid as identified by the + * device query. + * @rev: chip revision, see id. + * @bus_type: The type of bus. Some chipid/rev exist for different bus + * types. Each bus type has its own set of settings. + * @feature_disable: Bitmask of features to disable (override), See feature.c + * in brcmfmac for details. + * @country_codes: If available, pointer to struct for translating country + * codes. + * @bus: Bus specific (union) device settings. Currently only + * SDIO. + */ +struct brcmfmac_pd_device { + unsigned int id; + unsigned int rev; + enum brcmf_bus_type bus_type; + unsigned int feature_disable; + struct brcmfmac_pd_cc *country_codes; + union { + struct brcmfmac_sdio_pd sdio; + } bus; +}; + +/** + * struct brcmfmac_platform_data - BRCMFMAC specific platform data. + * + * @power_on: This function is called by the brcmfmac driver when the module + * gets loaded. This can be particularly useful for low power + * devices. The platform spcific routine may for example decide to + * power up the complete device. If there is no use-case for this + * function then provide NULL. + * @power_off: This function is called by the brcmfmac when the module gets + * unloaded. At this point the devices can be powered down or + * otherwise be reset. So if an actual power_off is not supported + * but reset is supported by the devices then reset the devices + * when this function gets called. This can be particularly useful + * for low power devices. If there is no use-case for this + * function then provide NULL. + */ +struct brcmfmac_platform_data { + void (*power_on)(void); + void (*power_off)(void); + char *fw_alternative_path; + int device_count; + struct brcmfmac_pd_device devices[0]; +}; + + +#endif /* _LINUX_BRCMFMAC_PLATFORM_H */ -- cgit v1.2.3 From 2e62f9b2a41e4ade1a0bb3c1bbda4defe4c67243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 12 Feb 2016 10:15:43 +0100 Subject: bcma: drop unneeded fields from bcma_pflash struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most of info stored in this struct wasn't really used anywhere as we put all that data in platform data & resource as well. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_driver_chipcommon.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 700d0c6f7480..16eaaad9dda5 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -579,9 +579,6 @@ struct bcma_chipcommon_pmu { #ifdef CONFIG_BCMA_DRIVER_MIPS struct bcma_pflash { bool present; - u8 buswidth; - u32 window; - u32 window_size; }; #ifdef CONFIG_BCMA_SFLASH -- cgit v1.2.3 From d6a3b51ada68c2bd3e184f4729ce626a1721cf74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Fri, 12 Feb 2016 10:15:44 +0100 Subject: bcma: move parallel flash support to separated file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This follows the way of handling other flashes and cleans code a bit. As next task we will want to move flash code to ChipCommon driver as: 1) Flash controllers are accesible using ChipCommon registers 2) This code isn't MIPS specific This change prepares bcma for that. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma_driver_chipcommon.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 16eaaad9dda5..846513c73606 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -576,10 +576,11 @@ struct bcma_chipcommon_pmu { u32 crystalfreq; /* The active crystal frequency (in kHz) */ }; -#ifdef CONFIG_BCMA_DRIVER_MIPS +#ifdef CONFIG_BCMA_PFLASH struct bcma_pflash { bool present; }; +#endif #ifdef CONFIG_BCMA_SFLASH struct mtd_info; @@ -603,6 +604,7 @@ struct bcma_nflash { }; #endif +#ifdef CONFIG_BCMA_DRIVER_MIPS struct bcma_serial_port { void *regs; unsigned long clockspeed; @@ -622,8 +624,9 @@ struct bcma_drv_cc { /* Fast Powerup Delay constant */ u16 fast_pwrup_delay; struct bcma_chipcommon_pmu pmu; -#ifdef CONFIG_BCMA_DRIVER_MIPS +#ifdef CONFIG_BCMA_PFLASH struct bcma_pflash pflash; +#endif #ifdef CONFIG_BCMA_SFLASH struct bcma_sflash sflash; #endif @@ -631,6 +634,7 @@ struct bcma_drv_cc { struct bcma_nflash nflash; #endif +#ifdef CONFIG_BCMA_DRIVER_MIPS int nr_serial_ports; struct bcma_serial_port serial_ports[4]; #endif /* CONFIG_BCMA_DRIVER_MIPS */ -- cgit v1.2.3 From 088c86183012495b53ecc1c734909e5712a40b66 Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Fri, 4 Mar 2016 12:35:05 -0500 Subject: qed/qede: Add infrastructure support for hardware GRO This patch adds mainly structures and APIs prototype changes in order to give support for qede slowpath/fastpath support for the same. Signed-off-by: Yuval Mintz Signed-off-by: Manish Chopra Signed-off-by: David S. Miller --- include/linux/qed/qed_eth_if.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index e53b0ca49e41..e1d69834a11f 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -39,6 +39,14 @@ struct qed_update_vport_params { struct qed_update_vport_rss_params rss_params; }; +struct qed_start_vport_params { + bool remove_inner_vlan; + bool gro_enable; + bool drop_ttl0; + u8 vport_id; + u16 mtu; +}; + struct qed_stop_rxq_params { u8 rss_id; u8 rx_queue_id; @@ -118,9 +126,7 @@ struct qed_eth_ops { void *cookie); int (*vport_start)(struct qed_dev *cdev, - u8 vport_id, u16 mtu, - u8 drop_ttl0_flg, - u8 inner_vlan_removal_en_flg); + struct qed_start_vport_params *params); int (*vport_stop)(struct qed_dev *cdev, u8 vport_id); -- cgit v1.2.3 From 8050c0f0274a15841756968857cfb07b3ab809ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:02 +0100 Subject: bpf: allow bpf_csum_diff to feed bpf_l3_csum_replace as well Commit 7d672345ed29 ("bpf: add generic bpf_csum_diff helper") added a generic checksum diff helper that can feed bpf_l4_csum_replace() with a target __wsum diff that is to be applied to the L4 checksum. This facility is very flexible, can be cascaded, allows for adding, removing, or diffing data, or for calculating the pseudo header checksum from scratch, but it can also be reused for working with the IPv4 header checksum. Thus, analogous to bpf_l4_csum_replace(), add a case for header field value of 0 to change the checksum at a given offset through a new helper csum_replace_by_diff(). Also, in addition to that, this provides an easy to use interface for feeding precalculated diffs f.e. coming from a map. It nicely complements bpf_l3_csum_replace() that currently allows only for csum updates of 2 and 4 byte diffs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/checksum.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 10a16b5bd1c7..abffc64e7300 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -120,6 +120,11 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) #define CSUM_MANGLED_0 ((__force __sum16)0xffff) +static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +{ + *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); +} + static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); -- cgit v1.2.3 From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:03 +0100 Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash When overwriting parts of the packet with bpf_skb_store_bytes() that were fed previously into skb->hash calculation, we should clear the current hash with skb_clear_hash(), so that a next skb_get_hash() call can determine the correct hash related to this skb. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ee2193287cbe..2e3e90309904 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -305,6 +305,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_store_bytes flags. */ #define BPF_F_RECOMPUTE_CSUM (1ULL << 0) +#define BPF_F_INVALIDATE_HASH (1ULL << 1) /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. * First 4 bits are for passing the header field size. -- cgit v1.2.3 From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:05 +0100 Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to feed df flag into tunneling facilities (currently supported on TX by vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key() helper. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e3e90309904..21ee6d52016f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -330,6 +330,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) +#define BPF_F_DONT_FRAGMENT (1ULL << 2) /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure -- cgit v1.2.3 From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:06 +0100 Subject: bpf: support for access to tunnel options After eBPF being able to programmatically access/manage tunnel key meta data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata") and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary helpers to generically access their auxiliary tunnel options. Geneve and vxlan support this facility. For geneve, TLVs can be pushed, and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve case only makes sense, if we can also read/write TLVs into it. In the GBP case, it provides the flexibility to easily map the group policy ID in combination with other helpers or maps. I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(), for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather complex by itself, and there may be cases for tunnel key backends where tunnel options are not always needed. If we would have integrated this into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with remaining helper arguments, so keeping compatibility on structs in case of passing in a flat buffer gets more cumbersome. Separating both also allows for more flexibility and future extensibility, f.e. options could be fed directly from a map, etc. Moreover, change geneve's xmit path to test only for info->options_len instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's xmit path and allows for avoiding to specify a protocol flag in the API on xmit, so it can be protocol agnostic. Having info->options_len is enough information that is needed. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 21ee6d52016f..9221f653fee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -298,6 +298,17 @@ enum bpf_func_id { * Return: csum result */ BPF_FUNC_csum_diff, + + /** + * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) + * retrieve or populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success for set, option size for get + */ + BPF_FUNC_skb_get_tunnel_opt, + BPF_FUNC_skb_set_tunnel_opt, __BPF_FUNC_MAX_ID, }; -- cgit v1.2.3 From db3c6139e6ead91b42e7c2ad044ed8beaee884e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:07 +0100 Subject: bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used is unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handeled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache. Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann Acked-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5f28b606633e..e1395d70fb48 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -140,6 +140,7 @@ struct ip_tunnel { #define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) #define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) @@ -206,6 +207,20 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } +static inline bool +ip_tunnel_dst_cache_usable(const struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + if (skb->mark) + return false; + if (!info) + return true; + if (info->key.tun_flags & TUNNEL_NOCACHE) + return false; + + return true; +} + static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { -- cgit v1.2.3 From 9a03cd8f38efb83c13fbe62aff50eea4efff93da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Tue, 8 Mar 2016 14:44:35 +0100 Subject: ipv6: per netns fib6 walkers The IPv6 FIB data structures are separated per network namespace but there is still only one global walkers list and one global walker list lock. This means changes in one namespace unnecessarily interfere with walkers in other namespaces. Replace the global list with per-netns lists (and give each its own lock). Signed-off-by: Michal Kubecek Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c0368db6df54..f0109b973648 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -58,7 +58,9 @@ struct netns_ipv6 { struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; struct fib6_table *fib6_main_tbl; + struct list_head fib6_walkers; struct dst_ops ip6_dst_ops; + rwlock_t fib6_walker_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 3dc94f93be161ec4203673de9a34b7362d8985b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Tue, 8 Mar 2016 14:44:45 +0100 Subject: ipv6: per netns FIB garbage collection One of our customers observed issues with FIB6 garbage collectors running in different network namespaces blocking each other, resulting in soft lockups (fib6_run_gc() initiated from timer runs always in forced mode). Now that FIB6 walkers are separated per namespace, there is no more need for instances of fib6_run_gc() in different namespaces blocking each other. There is still a call to icmp6_dst_gc() which operates on shared data but this function is protected by its own shared lock. Signed-off-by: Michal Kubecek Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index f0109b973648..10d0848f5b8a 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -61,6 +61,7 @@ struct netns_ipv6 { struct list_head fib6_walkers; struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; + spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From b121d1e74d1f24654bdc3165d3db1ca149501356 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:13 -0800 Subject: bpf: prevent kprobe+bpf deadlocks if kprobe is placed within update or delete hash map helpers that hold bucket spin lock and triggered bpf program is trying to grab the spinlock for the same bucket on the same cpu, it will deadlock. Fix it by extending existing recursion prevention mechanism. Note, map_lookup and other tracing helpers don't have this problem, since they don't hold any locks and don't modify global data. bpf_trace_printk has its own recursive check and ok as well. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 51e498e5470e..4b070827200d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bpf_map; @@ -163,6 +164,8 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f const struct bpf_func_proto *bpf_get_trace_printk_proto(void); #ifdef CONFIG_BPF_SYSCALL +DECLARE_PER_CPU(int, bpf_prog_active); + void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); -- cgit v1.2.3 From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:15 -0800 Subject: bpf: pre-allocate hash map elements If kprobe is placed on spin_unlock then calling kmalloc/kfree from bpf programs is not safe, since the following dead lock is possible: kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe-> bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock) and deadlocks. The following solutions were considered and some implemented, but eventually discarded - kmem_cache_create for every map - add recursion check to slow-path of slub - use reserved memory in bpf_map_update for in_irq or in preempt_disabled - kmalloc via irq_work At the end pre-allocation of all map elements turned out to be the simplest solution and since the user is charged upfront for all the memory, such pre-allocation doesn't affect the user space visible behavior. Since it's impossible to tell whether kprobe is triggered in a safe location from kmalloc point of view, use pre-allocation by default and introduce new BPF_F_NO_PREALLOC flag. While testing of per-cpu hash maps it was discovered that alloc_percpu(GFP_ATOMIC) has odd corner cases and often fails to allocate memory even when 90% of it is free. The pre-allocation of per-cpu hash elements solves this problem as well. Turned out that bpf_map_update() quickly followed by bpf_map_lookup()+bpf_map_delete() is very common pattern used in many of iovisor/bcc/tools, so there is additional benefit of pre-allocation, since such use cases are must faster. Since all hash map elements are now pre-allocated we can remove atomic increment of htab->count and save few more cycles. Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid large malloc/free done by users who don't have sufficient limits. Pre-allocation is done with vmalloc and alloc/free is done via percpu_freelist. Here are performance numbers for different pre-allocation algorithms that were implemented, but discarded in favor of percpu_freelist: 1 cpu: pcpu_ida 2.1M pcpu_ida nolock 2.3M bt 2.4M kmalloc 1.8M hlist+spinlock 2.3M pcpu_freelist 2.6M 4 cpu: pcpu_ida 1.5M pcpu_ida nolock 1.8M bt w/smp_align 1.7M bt no/smp_align 1.1M kmalloc 0.7M hlist+spinlock 0.2M pcpu_freelist 2.0M 8 cpu: pcpu_ida 0.7M bt w/smp_align 0.8M kmalloc 0.4M pcpu_freelist 1.5M 32 cpu: kmalloc 0.13M pcpu_freelist 0.49M pcpu_ida nolock is a modified percpu_ida algorithm without percpu_ida_cpu locks and without cross-cpu tag stealing. It's faster than existing percpu_ida, but not as fast as pcpu_freelist. bt is a variant of block/blk-mq-tag.c simlified and customized for bpf use case. bt w/smp_align is using cache line for every 'long' (similar to blk-mq-tag). bt no/smp_align allocates 'long' bitmasks continuously to save memory. It's comparable to percpu_ida and in some cases faster, but slower than percpu_freelist hlist+spinlock is the simplest free list with single spinlock. As expeceted it has very bad scaling in SMP. kmalloc is existing implementation which is still available via BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist, but saves memory, so in cases where map->max_entries can be large and number of map update/delete per second is low, it may make sense to use it. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4b070827200d..efd1d4ca95c6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,7 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; + u32 map_flags; u32 pages; struct user_struct *user; const struct bpf_map_ops *ops; @@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); +int bpf_map_precharge_memlock(u32 pages); extern int sysctl_unprivileged_bpf_disabled; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9221f653fee3..0e30b19012a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,12 +101,15 @@ enum bpf_prog_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_NO_PREALLOC (1U << 0) + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* prealloc or not */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3 From 557c0c6e7df8e14a46bd7560d193fa5bbc00a858 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Mar 2016 21:57:17 -0800 Subject: bpf: convert stackmap to pre-allocation It was observed that calling bpf_get_stackid() from a kprobe inside slub or from spin_unlock causes similar deadlock as with hashmap, therefore convert stackmap to use pre-allocated memory. The call_rcu is no longer feasible mechanism, since delayed freeing causes bpf_get_stackid() to fail unpredictably when number of actual stacks is significantly less than user requested max_entries. Since elements are no longer freed into slub, we can push elements into freelist immediately and let them be recycled. However the very unlikley race between user space map_lookup() and program-side recycling is possible: cpu0 cpu1 ---- ---- user does lookup(stackidX) starts copying ips into buffer delete(stackidX) calls bpf_get_stackid() which recyles the element and overwrites with new stack trace To avoid user space seeing a partial stack trace consisting of two merged stack traces, do bucket = xchg(, NULL); copy; xchg(,bucket); to preserve consistent stack trace delivery to user space. Now we can move memset(,0) of left-over element value from critical path of bpf_get_stackid() into slow-path of user space lookup. Also disallow lookup() from bpf program, since it's useless and program shouldn't be messing with collected stack trace. Note that similar race between user space lookup and kernel side updates is also present in hashmap, but it's not a new race. bpf programs were always allowed to modify hash and array map elements while user space is copying them. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index efd1d4ca95c6..21ee41b92e8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -195,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 flags); int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. -- cgit v1.2.3 From e28e87ed474c5a0b378c66fb85efc8e487f4f63f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2016 23:36:03 +0100 Subject: ip_tunnel, bpf: ip_tunnel_info_opts_{get, set} depends on CONFIG_INET Helpers like ip_tunnel_info_opts_{get,set}() are only available if CONFIG_INET is set, thus add an empty definition into the header for the !CONFIG_INET case, where already other empty inline helpers are defined. This avoids ifdef kludge inside filter.c, but also vxlan and geneve themself where this facility can only be used with, depend on INET being set. For the !INET case TUNNEL_OPTIONS_PRESENT would never be set in flags. Fixes: 14ca0751c96f ("bpf: support for access to tunnel options") Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e1395d70fb48..0acd80fadb32 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -369,6 +369,17 @@ static inline void ip_tunnel_unneed_metadata(void) { } +static inline void ip_tunnel_info_opts_get(void *to, + const struct ip_tunnel_info *info) +{ +} + +static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, + const void *from, int len) +{ + info->options_len = 0; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ -- cgit v1.2.3 From ff3c44e675054533403909ecb76e78c1d4efbd26 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:00 -0800 Subject: rcu: Add list_next_or_null_rcu This is a convenience function that returns the next entry in an RCU list or NULL if at the end of the list. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/rculist.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 14ec1652daf4..17d4f849c65e 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -318,6 +318,27 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) +/** + * list_next_or_null_rcu - get the first element from a list + * @head: the head for the list. + * @ptr: the list head to take the next element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the ptr is at the end of the list, NULL is returned. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_next_or_null_rcu(head, ptr, type, member) \ +({ \ + struct list_head *__head = (head); \ + struct list_head *__ptr = (ptr); \ + struct list_head *__next = READ_ONCE(__ptr->next); \ + likely(__next != __head) ? list_entry_rcu(__next, type, \ + member) : NULL; \ +}) + /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. -- cgit v1.2.3 From f4a00aacdb5f6784d46e8c999b6bb52ece4b306b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:01 -0800 Subject: net: Make sock_alloc exportable Export it for cases where we want to create sockets by hand. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 0b4ac7da583a..49175e4ced11 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -215,6 +215,7 @@ int __sock_create(struct net *net, int family, int type, int proto, int sock_create(int family, int type, int proto, struct socket **res); int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); +struct socket *sock_alloc(void); void sock_release(struct socket *sock); int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, -- cgit v1.2.3 From f092276d85b82504e8a07498f4e9e0c51f06745c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:03 -0800 Subject: net: Add MSG_BATCH flag Add a new msg flag called MSG_BATCH. This flag is used in sendmsg to indicate that more messages will follow (i.e. a batch of messages is being sent). This is similar to MSG_MORE except that the following messages are not merged into one packet, they are sent individually. sendmmsg is updated so that each contained message except for the last one is marked as MSG_BATCH. MSG_BATCH is a performance optimization in cases where a socket implementation can benefit by transmitting packets in a batch. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 5bf59c8493b7..d834af22a460 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -274,6 +274,7 @@ struct ucred { #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ +#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ -- cgit v1.2.3 From 473bd239b808a8af5241ce9996a16d283d88ddff Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:05 -0800 Subject: tcp: Add tcp_inq to get available receive bytes on socket Create a common kernel function to get the number of bytes available on a TCP socket. This is based on code in INQ getsockopt and we now call the function for that getsockopt. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/tcp.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e90db8546806..0302636af98c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1816,4 +1816,28 @@ static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) skb->truesize = 2; } +static inline int tcp_inq(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int answ; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + answ = 0; + } else if (sock_flag(sk, SOCK_URGINLINE) || + !tp->urg_data || + before(tp->urg_seq, tp->copied_seq) || + !before(tp->urg_seq, tp->rcv_nxt)) { + + answ = tp->rcv_nxt - tp->copied_seq; + + /* Subtract 1, if FIN was received */ + if (answ && sock_flag(sk, SOCK_DONE)) + answ--; + } else { + answ = tp->urg_seq - tp->copied_seq; + } + + return answ; +} + #endif /* _TCP_H */ -- cgit v1.2.3 From ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:06 -0800 Subject: kcm: Kernel Connection Multiplexor module This module implements the Kernel Connection Multiplexor. Kernel Connection Multiplexor (KCM) is a facility that provides a message based interface over TCP for generic application protocols. With KCM an application can efficiently send and receive application protocol messages over TCP using datagram sockets. For more information see the included Documentation/networking/kcm.txt Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/socket.h | 6 ++- include/net/kcm.h | 125 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kcm.h | 40 +++++++++++++++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 include/net/kcm.h create mode 100644 include/uapi/linux/kcm.h (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index d834af22a460..73bf6c6a833b 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -200,7 +200,9 @@ struct ucred { #define AF_ALG 38 /* Algorithm sockets */ #define AF_NFC 39 /* NFC sockets */ #define AF_VSOCK 40 /* vSockets */ -#define AF_MAX 41 /* For now.. */ +#define AF_KCM 41 /* Kernel Connection Multiplexor*/ + +#define AF_MAX 42 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -246,6 +248,7 @@ struct ucred { #define PF_ALG AF_ALG #define PF_NFC AF_NFC #define PF_VSOCK AF_VSOCK +#define PF_KCM AF_KCM #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ @@ -323,6 +326,7 @@ struct ucred { #define SOL_CAIF 278 #define SOL_ALG 279 #define SOL_NFC 280 +#define SOL_KCM 281 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/net/kcm.h b/include/net/kcm.h new file mode 100644 index 000000000000..1bcae39070ec --- /dev/null +++ b/include/net/kcm.h @@ -0,0 +1,125 @@ +/* + * Kernel Connection Multiplexor + * + * Copyright (c) 2016 Tom Herbert + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#ifndef __NET_KCM_H_ +#define __NET_KCM_H_ + +#include +#include +#include + +extern unsigned int kcm_net_id; + +struct kcm_tx_msg { + unsigned int sent; + unsigned int fragidx; + unsigned int frag_offset; + unsigned int msg_flags; + struct sk_buff *frag_skb; + struct sk_buff *last_skb; +}; + +struct kcm_rx_msg { + int full_len; + int accum_len; + int offset; +}; + +/* Socket structure for KCM client sockets */ +struct kcm_sock { + struct sock sk; + struct kcm_mux *mux; + struct list_head kcm_sock_list; + int index; + u32 done : 1; + struct work_struct done_work; + + /* Transmit */ + struct kcm_psock *tx_psock; + struct work_struct tx_work; + struct list_head wait_psock_list; + struct sk_buff *seq_skb; + + /* Don't use bit fields here, these are set under different locks */ + bool tx_wait; + bool tx_wait_more; + + /* Receive */ + struct kcm_psock *rx_psock; + struct list_head wait_rx_list; /* KCMs waiting for receiving */ + bool rx_wait; + u32 rx_disabled : 1; +}; + +struct bpf_prog; + +/* Structure for an attached lower socket */ +struct kcm_psock { + struct sock *sk; + struct kcm_mux *mux; + int index; + + u32 tx_stopped : 1; + u32 rx_stopped : 1; + u32 done : 1; + u32 unattaching : 1; + + void (*save_state_change)(struct sock *sk); + void (*save_data_ready)(struct sock *sk); + void (*save_write_space)(struct sock *sk); + + struct list_head psock_list; + + /* Receive */ + struct sk_buff *rx_skb_head; + struct sk_buff **rx_skb_nextp; + struct sk_buff *ready_rx_msg; + struct list_head psock_ready_list; + struct work_struct rx_work; + struct delayed_work rx_delayed_work; + struct bpf_prog *bpf_prog; + struct kcm_sock *rx_kcm; + + /* Transmit */ + struct kcm_sock *tx_kcm; + struct list_head psock_avail_list; +}; + +/* Per net MUX list */ +struct kcm_net { + struct mutex mutex; + struct list_head mux_list; + int count; +}; + +/* Structure for a MUX */ +struct kcm_mux { + struct list_head kcm_mux_list; + struct rcu_head rcu; + struct kcm_net *knet; + + struct list_head kcm_socks; /* All KCM sockets on MUX */ + int kcm_socks_cnt; /* Total KCM socket count for MUX */ + struct list_head psocks; /* List of all psocks on MUX */ + int psocks_cnt; /* Total attached sockets */ + + /* Receive */ + spinlock_t rx_lock ____cacheline_aligned_in_smp; + struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */ + struct list_head psocks_ready; /* List of psocks with a msg ready */ + struct sk_buff_head rx_hold_queue; + + /* Transmit */ + spinlock_t lock ____cacheline_aligned_in_smp; /* TX and mux locking */ + struct list_head psocks_avail; /* List of available psocks */ + struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */ +}; + +#endif /* __NET_KCM_H_ */ diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h new file mode 100644 index 000000000000..a5a530940b99 --- /dev/null +++ b/include/uapi/linux/kcm.h @@ -0,0 +1,40 @@ +/* + * Kernel Connection Multiplexor + * + * Copyright (c) 2016 Tom Herbert + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * User API to clone KCM sockets and attach transport socket to a KCM + * multiplexor. + */ + +#ifndef KCM_KERNEL_H +#define KCM_KERNEL_H + +struct kcm_attach { + int fd; + int bpf_fd; +}; + +struct kcm_unattach { + int fd; +}; + +struct kcm_clone { + int fd; +}; + +#define SIOCKCMATTACH (SIOCPROTOPRIVATE + 0) +#define SIOCKCMUNATTACH (SIOCPROTOPRIVATE + 1) +#define SIOCKCMCLONE (SIOCPROTOPRIVATE + 2) + +#define KCMPROTO_CONNECTED 0 + +/* Socket options */ +#define KCM_RECV_DISABLE 1 + +#endif + -- cgit v1.2.3 From cd6e111bf5be5c70aef96a86d791ee7be0c0e137 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:07 -0800 Subject: kcm: Add statistics and proc interfaces This patch adds various counters for KCM. These include counters for messages and bytes received or sent, as well as counters for number of attached/unattached TCP sockets and other error or edge events. The statistics are exposed via a proc interface. /proc/net/kcm provides statistics per KCM socket and per psock (attached TCP sockets). /proc/net/kcm_stats provides aggregate statistics. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/kcm.h | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'include') diff --git a/include/net/kcm.h b/include/net/kcm.h index 1bcae39070ec..39c7abe98552 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -17,6 +17,42 @@ extern unsigned int kcm_net_id; +#define KCM_STATS_ADD(stat, count) ((stat) += (count)) +#define KCM_STATS_INCR(stat) ((stat)++) + +struct kcm_psock_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned long long tx_msgs; + unsigned long long tx_bytes; + unsigned int rx_aborts; + unsigned int rx_mem_fail; + unsigned int rx_need_more_hdr; + unsigned int rx_bad_hdr_len; + unsigned long long reserved; + unsigned long long unreserved; + unsigned int tx_aborts; +}; + +struct kcm_mux_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned long long tx_msgs; + unsigned long long tx_bytes; + unsigned int rx_ready_drops; + unsigned int tx_retries; + unsigned int psock_attach; + unsigned int psock_unattach_rsvd; + unsigned int psock_unattach; +}; + +struct kcm_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned long long tx_msgs; + unsigned long long tx_bytes; +}; + struct kcm_tx_msg { unsigned int sent; unsigned int fragidx; @@ -41,6 +77,8 @@ struct kcm_sock { u32 done : 1; struct work_struct done_work; + struct kcm_stats stats; + /* Transmit */ struct kcm_psock *tx_psock; struct work_struct tx_work; @@ -77,6 +115,8 @@ struct kcm_psock { struct list_head psock_list; + struct kcm_psock_stats stats; + /* Receive */ struct sk_buff *rx_skb_head; struct sk_buff **rx_skb_nextp; @@ -86,15 +126,21 @@ struct kcm_psock { struct delayed_work rx_delayed_work; struct bpf_prog *bpf_prog; struct kcm_sock *rx_kcm; + unsigned long long saved_rx_bytes; + unsigned long long saved_rx_msgs; /* Transmit */ struct kcm_sock *tx_kcm; struct list_head psock_avail_list; + unsigned long long saved_tx_bytes; + unsigned long long saved_tx_msgs; }; /* Per net MUX list */ struct kcm_net { struct mutex mutex; + struct kcm_psock_stats aggregate_psock_stats; + struct kcm_mux_stats aggregate_mux_stats; struct list_head mux_list; int count; }; @@ -110,6 +156,9 @@ struct kcm_mux { struct list_head psocks; /* List of all psocks on MUX */ int psocks_cnt; /* Total attached sockets */ + struct kcm_mux_stats stats; + struct kcm_psock_stats aggregate_psock_stats; + /* Receive */ spinlock_t rx_lock ____cacheline_aligned_in_smp; struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */ @@ -122,4 +171,49 @@ struct kcm_mux { struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */ }; +#ifdef CONFIG_PROC_FS +int kcm_proc_init(void); +void kcm_proc_exit(void); +#else +static int kcm_proc_init(void) { return 0; } +static void kcm_proc_exit(void) { } +#endif + +static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, + struct kcm_psock_stats *agg_stats) +{ + /* Save psock statistics in the mux when psock is being unattached. */ + +#define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) + SAVE_PSOCK_STATS(rx_msgs); + SAVE_PSOCK_STATS(rx_bytes); + SAVE_PSOCK_STATS(rx_aborts); + SAVE_PSOCK_STATS(rx_mem_fail); + SAVE_PSOCK_STATS(rx_need_more_hdr); + SAVE_PSOCK_STATS(rx_bad_hdr_len); + SAVE_PSOCK_STATS(tx_msgs); + SAVE_PSOCK_STATS(tx_bytes); + SAVE_PSOCK_STATS(reserved); + SAVE_PSOCK_STATS(unreserved); + SAVE_PSOCK_STATS(tx_aborts); +#undef SAVE_PSOCK_STATS +} + +static inline void aggregate_mux_stats(struct kcm_mux_stats *stats, + struct kcm_mux_stats *agg_stats) +{ + /* Save psock statistics in the mux when psock is being unattached. */ + +#define SAVE_MUX_STATS(_stat) (agg_stats->_stat += stats->_stat) + SAVE_MUX_STATS(rx_msgs); + SAVE_MUX_STATS(rx_bytes); + SAVE_MUX_STATS(tx_msgs); + SAVE_MUX_STATS(tx_bytes); + SAVE_MUX_STATS(rx_ready_drops); + SAVE_MUX_STATS(psock_attach); + SAVE_MUX_STATS(psock_unattach_rsvd); + SAVE_MUX_STATS(psock_unattach); +#undef SAVE_MUX_STATS +} + #endif /* __NET_KCM_H_ */ -- cgit v1.2.3 From 7ced95ef525c329f947c424859cf2b0a3b731f8c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:10 -0800 Subject: kcm: Add memory limit for receive message construction Message assembly is performed on the TCP socket. This is logically equivalent of an application that performs a peek on the socket to find out how much memory is needed for a receive buffer. The receive socket buffer also provides the maximum message size which is checked. The receive algorithm is something like: 1) Receive the first skbuf for a message (or skbufs if multiple are needed to determine message length). 2) Check the message length against the number of bytes in the TCP receive queue (tcp_inq()). - If all the bytes of the message are in the queue (incluing the skbuf received), then proceed with message assembly (it should complete with the tcp_read_sock) - Else, mark the psock with the number of bytes needed to complete the message. 3) In TCP data ready function, if the psock indicates that we are waiting for the rest of the bytes of a messages, check the number of queued bytes against that. - If there are still not enough bytes for the message, just return - Else, clear the waiting bytes and proceed to receive the skbufs. The message should now be received in one tcp_read_sock Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/kcm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/kcm.h b/include/net/kcm.h index 39c7abe98552..d892956ff552 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -28,6 +28,7 @@ struct kcm_psock_stats { unsigned int rx_aborts; unsigned int rx_mem_fail; unsigned int rx_need_more_hdr; + unsigned int rx_msg_too_big; unsigned int rx_bad_hdr_len; unsigned long long reserved; unsigned long long unreserved; @@ -66,6 +67,7 @@ struct kcm_rx_msg { int full_len; int accum_len; int offset; + int early_eaten; }; /* Socket structure for KCM client sockets */ @@ -128,6 +130,7 @@ struct kcm_psock { struct kcm_sock *rx_kcm; unsigned long long saved_rx_bytes; unsigned long long saved_rx_msgs; + unsigned int rx_need_bytes; /* Transmit */ struct kcm_sock *tx_kcm; @@ -190,6 +193,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, SAVE_PSOCK_STATS(rx_aborts); SAVE_PSOCK_STATS(rx_mem_fail); SAVE_PSOCK_STATS(rx_need_more_hdr); + SAVE_PSOCK_STATS(rx_msg_too_big); SAVE_PSOCK_STATS(rx_bad_hdr_len); SAVE_PSOCK_STATS(tx_msgs); SAVE_PSOCK_STATS(tx_bytes); -- cgit v1.2.3 From 29152a34f72cb4d7ab32885ad2f20a482c92a8f3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:11 -0800 Subject: kcm: Add receive message timeout This patch adds receive timeout for message assembly on the attached TCP sockets. The timeout is set when a new messages is started and the whole message has not been received by TCP (not in the receive queue). If the completely message is subsequently received the timer is cancelled, if the timer expires the RX side is aborted. The timeout value is taken from the socket timeout (SO_RCVTIMEO) that is set on a TCP socket (i.e. set by get sockopt before attaching a TCP socket to KCM. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/kcm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/kcm.h b/include/net/kcm.h index d892956ff552..95c425ca97b6 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -29,6 +29,7 @@ struct kcm_psock_stats { unsigned int rx_mem_fail; unsigned int rx_need_more_hdr; unsigned int rx_msg_too_big; + unsigned int rx_msg_timeouts; unsigned int rx_bad_hdr_len; unsigned long long reserved; unsigned long long unreserved; @@ -130,6 +131,7 @@ struct kcm_psock { struct kcm_sock *rx_kcm; unsigned long long saved_rx_bytes; unsigned long long saved_rx_msgs; + struct timer_list rx_msg_timer; unsigned int rx_need_bytes; /* Transmit */ @@ -194,6 +196,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, SAVE_PSOCK_STATS(rx_mem_fail); SAVE_PSOCK_STATS(rx_need_more_hdr); SAVE_PSOCK_STATS(rx_msg_too_big); + SAVE_PSOCK_STATS(rx_msg_timeouts); SAVE_PSOCK_STATS(rx_bad_hdr_len); SAVE_PSOCK_STATS(tx_msgs); SAVE_PSOCK_STATS(tx_bytes); -- cgit v1.2.3 From 87aca73737e379f079993802d2c43606f7c5d26c Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 21 Jan 2016 09:20:12 +0100 Subject: NFC: microread: Drop platform data header file Originally I only wanted to drop the unneeded inclusion of , but then noticed that struct microread_nfc_platform_data isn't actually used, and MICROREAD_DRIVER_NAME is redefined in the only file where it is used, so we can get rid of the header file and dead code altogether. Signed-off-by: Jean Delvare Cc: Lauro Ramos Venancio Cc: Aloisio Almeida Jr Signed-off-by: Samuel Ortiz --- include/linux/platform_data/microread.h | 35 --------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 include/linux/platform_data/microread.h (limited to 'include') diff --git a/include/linux/platform_data/microread.h b/include/linux/platform_data/microread.h deleted file mode 100644 index ca13992089b8..000000000000 --- a/include/linux/platform_data/microread.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Driver include for the Inside Secure microread NFC Chip. - * - * Copyright (C) 2011 Tieto Poland - * Copyright (C) 2012 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#ifndef _MICROREAD_H -#define _MICROREAD_H - -#include - -#define MICROREAD_DRIVER_NAME "microread" - -/* board config platform data for microread */ -struct microread_nfc_platform_data { - unsigned int rst_gpio; - unsigned int irq_gpio; - unsigned int ioh_gpio; -}; - -#endif /* _MICROREAD_H */ -- cgit v1.2.3 From 2793a23aacbd754dbbb5cb75093deb7e4103bace Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 9 Mar 2016 21:58:32 -0500 Subject: net: validate variable length ll headers Netdevice parameter hard_header_len is variously interpreted both as an upper and lower bound on link layer header length. The field is used as upper bound when reserving room at allocation, as lower bound when validating user input in PF_PACKET. Clarify the definition to be maximum header length. For validation of untrusted headers, add an optional validate member to header_ops. Allow bypassing of validation by passing CAP_SYS_RAWIO, for instance for deliberate testing of corrupt input. In this case, pad trailing bytes, as some device drivers expect completely initialized headers. See also http://comments.gmane.org/gmane.linux.network/401064 Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index efe7cec111fa..fd30cb545c45 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -268,6 +268,7 @@ struct header_ops { void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); + bool (*validate)(const char *ll_header, unsigned int len); }; /* These flag bits are private to the generic network queueing @@ -1459,8 +1460,7 @@ enum netdev_priv_flags { * @dma: DMA channel * @mtu: Interface MTU value * @type: Interface hardware type - * @hard_header_len: Hardware header length, which means that this is the - * minimum size of a packet. + * @hard_header_len: Maximum hardware header length. * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed @@ -2687,6 +2687,24 @@ static inline int dev_parse_header(const struct sk_buff *skb, return dev->header_ops->parse(skb, haddr); } +/* ll_header must have at least hard_header_len allocated */ +static inline bool dev_validate_header(const struct net_device *dev, + char *ll_header, int len) +{ + if (likely(len >= dev->hard_header_len)) + return true; + + if (capable(CAP_SYS_RAWIO)) { + memset(ll_header + len, 0, dev->hard_header_len - len); + return true; + } + + if (dev->header_ops && dev->header_ops->validate) + return dev->header_ops->validate(ll_header, len); + + return false; +} + typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len); int register_gifconf(unsigned int family, gifconf_func_t *gifconf); static inline int unregister_gifconf(unsigned int family) -- cgit v1.2.3 From f16089209e1029d45ae78dd238b6ab9b2c9a886c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 4 Mar 2016 10:10:20 +0100 Subject: mac802154: use put and get unaligned functions This patch removes the swap pointer and memmove functionality. Instead we use the well known put/get unaligned access with specific byte order handling. Signed-off-by: Alexander Aring Suggested-by: Marc Kleine-Budde Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 2e3cdd2048d2..6cd7a70706a9 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -16,10 +16,10 @@ #ifndef NET_MAC802154_H #define NET_MAC802154_H +#include #include #include #include -#include #include @@ -254,7 +254,7 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) return cpu_to_le16(0); } - return (__force __le16)__get_unaligned_memmove16(skb_mac_header(skb)); + return get_unaligned_le16(skb_mac_header(skb)); } /** @@ -264,7 +264,7 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) */ static inline void ieee802154_be64_to_le64(void *le64_dst, const void *be64_src) { - __put_unaligned_memmove64(swab64p(be64_src), le64_dst); + put_unaligned_le64(get_unaligned_be64(be64_src), le64_dst); } /** @@ -274,7 +274,7 @@ static inline void ieee802154_be64_to_le64(void *le64_dst, const void *be64_src) */ static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src) { - __put_unaligned_memmove64(swab64p(le64_src), be64_dst); + put_unaligned_be64(get_unaligned_le64(le64_src), be64_dst); } /** @@ -284,7 +284,7 @@ static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src) */ static inline void ieee802154_le16_to_be16(void *be16_dst, const void *le16_src) { - __put_unaligned_memmove16(swab16p(le16_src), be16_dst); + put_unaligned_be16(get_unaligned_le16(le16_src), be16_dst); } /** -- cgit v1.2.3 From 82a37adeedd38880940e2772ec1ae27a09353e5a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 9 Mar 2016 17:30:34 +0200 Subject: Bluetooth: Add support for limited privacy mode Introduce a limited privacy mode indicated by value 0x02 to the mgmt Set Privacy command. With value 0x02 the kernel will use privacy mode with a resolvable private address. In case the controller is bondable and discoverable the identity address will be used. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 339ea57be423..5d38d980b89d 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -233,6 +233,7 @@ enum { HCI_SC_ENABLED, HCI_SC_ONLY, HCI_PRIVACY, + HCI_LIMITED_PRIVACY, HCI_RPA_EXPIRED, HCI_RPA_RESOLVING, HCI_HS_ENABLED, -- cgit v1.2.3 From f720d0caa0af2c33ad15310974c7320345ab4468 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Mar 2016 19:31:12 +0100 Subject: kcm: mark helper functions inline The stub helper functions for the newly added kcm_proc_init/exit interfaces are defined as 'static' in a header file, which leads to build warnings for each file that includes them without calling them: include/net/kcm.h:183:12: error: 'kcm_proc_init' defined but not used [-Werror=unused-function] include/net/kcm.h:184:13: error: 'kcm_proc_exit' defined but not used [-Werror=unused-function] This marks the two functions as 'static inline' instead, which avoids the warnings and is obviously what was meant here. Signed-off-by: Arnd Bergmann Fixes: cd6e111bf5be ("kcm: Add statistics and proc interfaces") Signed-off-by: David S. Miller --- include/net/kcm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/kcm.h b/include/net/kcm.h index 95c425ca97b6..2840b5825dcc 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -180,8 +180,8 @@ struct kcm_mux { int kcm_proc_init(void); void kcm_proc_exit(void); #else -static int kcm_proc_init(void) { return 0; } -static void kcm_proc_exit(void) { } +static inline int kcm_proc_init(void) { return 0; } +static inline void kcm_proc_exit(void) { } #endif static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, -- cgit v1.2.3 From 5b33f48842fa1e13e9c0ea8cc59c1d0df19042db Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:29 +0200 Subject: net/flower: Introduce hardware offload support This patch is based on a patch made by John Fastabend. It adds support for offloading cls_flower. when NETIF_F_HW_TC is on: flags = 0 => Rule will be processed twice - by hardware, and if still relevant, by software. flags = SKIP_HW => Rull will be processed by software only If hardware fail/not capabale to apply the rule, operation will NOT fail. Filter will be processed by SW only. Acked-by: Jiri Pirko Suggested-by: John Fastabend Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ include/net/pkt_cls.h | 14 ++++++++++++++ include/uapi/linux/pkt_cls.h | 2 ++ 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fd30cb545c45..41df0b450757 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -786,6 +786,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, enum { TC_SETUP_MQPRIO, TC_SETUP_CLSU32, + TC_SETUP_CLSFLOWER, }; struct tc_cls_u32_offload; @@ -795,6 +796,7 @@ struct tc_to_netdev { union { u8 tc; struct tc_cls_u32_offload *cls_u32; + struct tc_cls_flower_offload *cls_flower; }; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bea14eee373e..5b4e8f08b8f0 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -409,4 +409,18 @@ static inline bool tc_should_offload(struct net_device *dev, u32 flags) return true; } +enum tc_fl_command { + TC_CLSFLOWER_REPLACE, + TC_CLSFLOWER_DESTROY, +}; + +struct tc_cls_flower_offload { + enum tc_fl_command command; + u64 cookie; + struct flow_dissector *dissector; + struct fl_flow_key *mask; + struct fl_flow_key *key; + struct tcf_exts *exts; +}; + #endif diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 9874f5680926..c43c5f78b9c4 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -417,6 +417,8 @@ enum { TCA_FLOWER_KEY_TCP_DST, /* be16 */ TCA_FLOWER_KEY_UDP_SRC, /* be16 */ TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, __TCA_FLOWER_MAX, }; -- cgit v1.2.3 From 8de2d793daf784f8f109565bcc023a6d198bad85 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:30 +0200 Subject: net/flow_dissector: Make dissector_uses_key() and skb_flow_dissector_target() public Will be used in a following patch to query if a key is being used, and what it's value in the target object. Acked-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8c8548cf5888..d3d60dccd19f 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -184,4 +184,17 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys); +static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + return flow_dissector->used_keys & (1 << key_id); +} + +static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id, + void *target_container) +{ + return ((char *)target_container) + flow_dissector->offset[key_id]; +} + #endif -- cgit v1.2.3 From 00175aec941e9c306d8a5ce930b2d91f7c04468c Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:31 +0200 Subject: net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef Introduce the macros tc_no_actions and tc_for_each_action to make code clearer. Extracted struct tc_action out of the ifdef to make calls to is_tcf_gact_shot() and similar functions valid, even when it is a nop. Acked-by: Jiri Pirko Acked-by: John Fastabend Suggested-by: Jiri Pirko Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/net/act_api.h | 21 ++++++++++++++++----- include/net/tc_act/tc_gact.h | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 342be6c5ab5c..2a19fe111c78 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -78,11 +78,6 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) tm->lastuse = now; } -#ifdef CONFIG_NET_CLS_ACT - -#define ACT_P_CREATED 1 -#define ACT_P_DELETED 1 - struct tc_action { void *priv; const struct tc_action_ops *ops; @@ -92,6 +87,11 @@ struct tc_action { struct tcf_hashinfo *hinfo; }; +#ifdef CONFIG_NET_CLS_ACT + +#define ACT_P_CREATED 1 +#define ACT_P_DELETED 1 + struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; @@ -171,5 +171,16 @@ int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); + +#define tc_no_actions(_exts) \ + (list_empty(&(_exts)->actions)) + +#define tc_for_each_action(_a, _exts) \ + list_for_each_entry(a, &(_exts)->actions, list) +#else /* CONFIG_NET_CLS_ACT */ + +#define tc_no_actions(_exts) true +#define tc_for_each_action(_a, _exts) while (0) + #endif /* CONFIG_NET_CLS_ACT */ #endif diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 04a31830711b..93c520b83d10 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -16,9 +16,9 @@ struct tcf_gact { #define to_gact(a) \ container_of(a->priv, struct tcf_gact, common) -#ifdef CONFIG_NET_CLS_ACT static inline bool is_tcf_gact_shot(const struct tc_action *a) { +#ifdef CONFIG_NET_CLS_ACT struct tcf_gact *gact; if (a->ops && a->ops->type != TCA_ACT_GACT) @@ -28,7 +28,7 @@ static inline bool is_tcf_gact_shot(const struct tc_action *a) if (gact->tcf_action == TC_ACT_SHOT) return true; +#endif return false; } -#endif #endif /* __NET_TC_GACT_H */ -- cgit v1.2.3 From 519afb1813eab066a0c9995a08861fd0af75d5ae Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:32 +0200 Subject: net/act_skbedit: Utility functions for mark action Enable device drivers to query the action, if and only if is a mark action and what value to use for marking. Acked-by: Jiri Pirko Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 0df9a0db4a8e..b496d5ad7d42 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -20,6 +20,7 @@ #define __NET_TC_SKBEDIT_H #include +#include struct tcf_skbedit { struct tcf_common common; @@ -32,4 +33,19 @@ struct tcf_skbedit { #define to_skbedit(a) \ container_of(a->priv, struct tcf_skbedit, common) +/* Return true iff action is mark */ +static inline bool is_tcf_skbedit_mark(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) + return to_skbedit(a)->flags == SKBEDIT_F_MARK; +#endif + return false; +} + +static inline u32 tcf_skbedit_mark(const struct tc_action *a) +{ + return to_skbedit(a)->mark; +} + #endif /* __NET_TC_SKBEDIT_H */ -- cgit v1.2.3 From 8208d21bf309551686b7a76d19059ae182a956d0 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Fri, 11 Mar 2016 11:08:45 +0200 Subject: net/flower: Fix pointer cast Cast pointer to unsigned long instead of u64, to fix compilation warning on 32 bit arch, spotted by 0day build. Fixes: 5b33f48 ("net/flower: Introduce hardware offload support") Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5b4e8f08b8f0..caa5e18636df 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -416,7 +416,7 @@ enum tc_fl_command { struct tc_cls_flower_offload { enum tc_fl_command command; - u64 cookie; + unsigned long cookie; struct flow_dissector *dissector; struct fl_flow_key *mask; struct fl_flow_key *key; -- cgit v1.2.3 From 4c656c13b254d598e83e586b7b4d36a2043dad85 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 8 Mar 2016 12:59:35 -0800 Subject: bridge: allow zero ageing time This fixes a regression in the bridge ageing time caused by: commit c62987bbd8a1 ("bridge: push bridge setting ageing_time down to switchdev") There are users of Linux bridge which use the feature that if ageing time is set to 0 it causes entries to never expire. See: https://www.linuxfoundation.org/collaborate/workgroups/networking/bridge For a pure software bridge, it is unnecessary for the code to have arbitrary restrictions on what values are allowable. Signed-off-by: Stephen Hemminger Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index a338a688ee4a..dcb89e3515db 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -46,10 +46,6 @@ struct br_ip_list { #define BR_LEARNING_SYNC BIT(9) #define BR_PROXYARP_WIFI BIT(10) -/* values as per ieee8021QBridgeFdbAgingTime */ -#define BR_MIN_AGEING_TIME (10 * HZ) -#define BR_MAX_AGEING_TIME (1000000 * HZ) - #define BR_DEFAULT_AGEING_TIME (300 * HZ) extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); -- cgit v1.2.3 From 134611446dc657e1bbc73ca0e4e6b599df687db0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:02 +0100 Subject: ip_tunnel: add support for setting flow label via collect metadata This patch extends udp_tunnel6_xmit_skb() to pass in the IPv6 flow label from call sites. Currently, there's no such option and it's always set to zero when writing ip6_flow_hdr(). Add a label member to ip_tunnel_key, so that flow-based tunnels via collect metadata frontends can make use of it. vxlan and geneve will be converted to add flow label support separately. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 5 ++++- include/net/ip_tunnels.h | 4 +++- include/net/udp_tunnel.h | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 84b833af6882..5db9f5910428 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -126,7 +126,7 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, ip_tunnel_key_init(&tun_dst->u.tun_info.key, iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, 0, tunnel_id, flags); + 0, 0, 0, tunnel_id, flags); return tun_dst; } @@ -152,8 +152,11 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.u.ipv6.src = ip6h->saddr; info->key.u.ipv6.dst = ip6h->daddr; + info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; + info->key.label = ip6_flowlabel(ip6h); + return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0acd80fadb32..5dc2e454f866 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -48,6 +48,7 @@ struct ip_tunnel_key { __be16 tun_flags; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ + __be32 label; /* Flow Label for IPv6 */ __be16 tp_src; __be16 tp_dst; }; @@ -181,7 +182,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, + u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, __be16 tun_flags) { @@ -192,6 +193,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; + key->label = label; key->tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 97f5adb121a6..b83114077cee 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -88,8 +88,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck); + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck); #endif void udp_tunnel_sock_release(struct socket *sock); -- cgit v1.2.3 From e7f70af111f086a20800ad2e17f544b2e3e0f375 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:03 +0100 Subject: vxlan: support setting IPv6 flow label This work adds support for setting the IPv6 flow label for vxlan per device and through collect metadata (ip_tunnel_key) frontends. The vxlan dst cache does not need any special considerations here, for the cases where caches can be used, the label is static per cache. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/vxlan.h | 1 + include/uapi/linux/if_link.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6eda4ed4d78b..a763c96ecde4 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -162,6 +162,7 @@ struct vxlan_config { u16 port_max; u8 tos; u8 ttl; + __be32 label; u32 flags; unsigned long age_interval; unsigned int addrmax; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d452cea59020..6bebc975031d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -456,6 +456,7 @@ enum { IFLA_VXLAN_GBP, IFLA_VXLAN_REMCSUM_NOPARTIAL, IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 8eb3b99554b82da968d1fbc00df9f3156c5e2d63 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:04 +0100 Subject: geneve: support setting IPv6 flow label This work adds support for setting the IPv6 flow label for geneve per device and through collect metadata (ip_tunnel_key) frontends. Also here, the geneve dst cache does not need any special considerations, for the cases where caches can be used, the label is static per cache. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6bebc975031d..249eef9a21bd 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum { IFLA_GENEVE_UDP_CSUM, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:05 +0100 Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key This patch extends bpf_tunnel_key with a tunnel_label member, that maps to ip_tunnel_key's label so underlying backends like vxlan and geneve can propagate the label to udp_tunnel6_xmit_skb(), where it's being set in the IPv6 header. It allows for having 20 more bits to encode/decode flow related meta information programmatically. Tested with vxlan and geneve. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e30b19012a5..924f537183fd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -375,6 +375,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; + __u32 tunnel_label; }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 338039635d01524090e7bd706a3e555e20d5b337 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 9 Mar 2016 09:25:26 -0800 Subject: csum: Update csum_block_add to use rotate instead of byteswap The code for csum_block_add was doing a funky byteswap to swap the even and odd bytes of the checksum if the offset was odd. Instead of doing this we can save ourselves some trouble and just shift by 8 as this should have the same effect in terms of the final checksum value and only requires one instruction. In addition we can update csum_block_sub to just use csum_block_add with a inverse value for csum2. This way we follow the same code path as csum_block_add without having to duplicate it. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/checksum.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index abffc64e7300..5c30891e84e5 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -88,8 +88,11 @@ static inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { u32 sum = (__force u32)csum2; - if (offset&1) - sum = ((sum&0xFF00FF)<<8)+((sum>>8)&0xFF00FF); + + /* rotate sum to align it with a 16b boundary */ + if (offset & 1) + sum = ror32(sum, 8); + return csum_add(csum, (__force __wsum)sum); } @@ -102,10 +105,7 @@ csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) static inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { - u32 sum = (__force u32)csum2; - if (offset&1) - sum = ((sum&0xFF00FF)<<8)+((sum>>8)&0xFF00FF); - return csum_sub(csum, (__force __wsum)sum); + return csum_block_add(csum, ~csum2, offset); } static inline __wsum csum_unfold(__sum16 n) -- cgit v1.2.3 From 136ba622de49a6bf1f6e5eab3391ed5d5dbe30e3 Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Thu, 10 Mar 2016 08:55:50 +0000 Subject: netconf: add macro to represent all attributes This patch adds macro NETCONFA_ALL to represent all type of netconf attributes for IPv4 and IPv6. Signed-off-by: Zhang Shengju Signed-off-by: David S. Miller --- include/uapi/linux/netconf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h index 23cbd34e4ac7..45dfad509c4d 100644 --- a/include/uapi/linux/netconf.h +++ b/include/uapi/linux/netconf.h @@ -19,6 +19,7 @@ enum { __NETCONFA_MAX }; #define NETCONFA_MAX (__NETCONFA_MAX - 1) +#define NETCONFA_ALL -1 #define NETCONFA_IFINDEX_ALL -1 #define NETCONFA_IFINDEX_DEFAULT -2 -- cgit v1.2.3 From 6b8abef5f833b03be1b5af491193477ad609ad35 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Thu, 10 Mar 2016 12:30:26 +0000 Subject: xen-netback: re-import canonical netif header The canonical netif header (in the Xen source repo) and the Linux variant have diverged significantly. Recently much documentation has been added to the canonical header which is highly useful for developers making modifications to either xen-netfront or xen-netback. This patch therefore re-imports the canonical header in its entirity. To maintain compatibility and some style consistency with the old Linux variant, the header was stripped of its emacs boilerplate, and post-processed and copied into place with the following commands: ed -s netif.h << EOF H ,s/NETTXF_/XEN_NETTXF_/g ,s/NETRXF_/XEN_NETRXF_/g ,s/NETIF_/XEN_NETIF_/g ,s/XEN_XEN_/XEN_/g ,s/netif/xen_netif/g ,s/xen_xen_/xen_/g ,s/^typedef.*$//g ,s/^ /${TAB}/g w $ w EOF indent --line-length 80 --linux-style netif.h \ -o include/xen/interface/io/netif.h Signed-off-by: Paul Durrant Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Cc: Wei Liu Acked-by: Wei Liu Signed-off-by: David S. Miller --- include/xen/interface/io/netif.h | 861 ++++++++++++++++++++++++++++++++++----- 1 file changed, 766 insertions(+), 95 deletions(-) (limited to 'include') diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h index 252ffd4801ef..4f20dbc42910 100644 --- a/include/xen/interface/io/netif.h +++ b/include/xen/interface/io/netif.h @@ -1,16 +1,34 @@ /****************************************************************************** - * netif.h + * xen_netif.h * * Unified network-device I/O interface for Xen guest OSes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2003-2004, Keir Fraser */ -#ifndef __XEN_PUBLIC_IO_NETIF_H__ -#define __XEN_PUBLIC_IO_NETIF_H__ +#ifndef __XEN_PUBLIC_IO_XEN_NETIF_H__ +#define __XEN_PUBLIC_IO_XEN_NETIF_H__ -#include -#include +#include "ring.h" +#include "../grant_table.h" /* * Older implementation of Xen network frontend / backend has an @@ -38,10 +56,10 @@ * that it cannot safely queue packets (as it may not be kicked to send them). */ - /* +/* * "feature-split-event-channels" is introduced to separate guest TX - * and RX notificaion. Backend either doesn't support this feature or - * advertise it via xenstore as 0 (disabled) or 1 (enabled). + * and RX notification. Backend either doesn't support this feature or + * advertises it via xenstore as 0 (disabled) or 1 (enabled). * * To make use of this feature, frontend should allocate two event * channels for TX and RX, advertise them to backend as @@ -118,151 +136,804 @@ */ /* - * This is the 'wire' format for packets: - * Request 1: xen_netif_tx_request -- XEN_NETTXF_* (any flags) - * [Request 2: xen_netif_extra_info] (only if request 1 has XEN_NETTXF_extra_info) - * [Request 3: xen_netif_extra_info] (only if request 2 has XEN_NETIF_EXTRA_MORE) - * Request 4: xen_netif_tx_request -- XEN_NETTXF_more_data - * Request 5: xen_netif_tx_request -- XEN_NETTXF_more_data + * "feature-multicast-control" and "feature-dynamic-multicast-control" + * advertise the capability to filter ethernet multicast packets in the + * backend. If the frontend wishes to take advantage of this feature then + * it may set "request-multicast-control". If the backend only advertises + * "feature-multicast-control" then "request-multicast-control" must be set + * before the frontend moves into the connected state. The backend will + * sample the value on this state transition and any subsequent change in + * value will have no effect. However, if the backend also advertises + * "feature-dynamic-multicast-control" then "request-multicast-control" + * may be set by the frontend at any time. In this case, the backend will + * watch the value and re-sample on watch events. + * + * If the sampled value of "request-multicast-control" is set then the + * backend transmit side should no longer flood multicast packets to the + * frontend, it should instead drop any multicast packet that does not + * match in a filter list. + * The list is amended by the frontend by sending dummy transmit requests + * containing XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} extra-info fragments as + * specified below. + * Note that the filter list may be amended even if the sampled value of + * "request-multicast-control" is not set, however the filter should only + * be applied if it is set. + */ + +/* + * Control ring + * ============ + * + * Some features, such as hashing (detailed below), require a + * significant amount of out-of-band data to be passed from frontend to + * backend. Use of xenstore is not suitable for large quantities of data + * because of quota limitations and so a dedicated 'control ring' is used. + * The ability of the backend to use a control ring is advertised by + * setting: + * + * /local/domain/X/backend///feature-ctrl-ring = "1" + * + * The frontend provides a control ring to the backend by setting: + * + * /local/domain//device/vif//ctrl-ring-ref = + * /local/domain//device/vif//event-channel-ctrl = + * + * where is the grant reference of the shared page used to + * implement the control ring and is an event channel to be used + * as a mailbox interrupt. These keys must be set before the frontend + * moves into the connected state. + * + * The control ring uses a fixed request/response message size and is + * balanced (i.e. one request to one response), so operationally it is much + * the same as a transmit or receive ring. + * Note that there is no requirement that responses are issued in the same + * order as requests. + */ + +/* + * Hash types + * ========== + * + * For the purposes of the definitions below, 'Packet[]' is an array of + * octets containing an IP packet without options, 'Array[X..Y]' means a + * sub-array of 'Array' containing bytes X thru Y inclusive, and '+' is + * used to indicate concatenation of arrays. + */ + +/* + * A hash calculated over an IP version 4 header as follows: + * + * Buffer[0..8] = Packet[12..15] (source address) + + * Packet[16..19] (destination address) + * + * Result = Hash(Buffer, 8) + */ +#define _XEN_NETIF_CTRL_HASH_TYPE_IPV4 0 +#define XEN_NETIF_CTRL_HASH_TYPE_IPV4 \ + (1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV4) + +/* + * A hash calculated over an IP version 4 header and TCP header as + * follows: + * + * Buffer[0..12] = Packet[12..15] (source address) + + * Packet[16..19] (destination address) + + * Packet[20..21] (source port) + + * Packet[22..23] (destination port) + * + * Result = Hash(Buffer, 12) + */ +#define _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP 1 +#define XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP \ + (1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP) + +/* + * A hash calculated over an IP version 6 header as follows: + * + * Buffer[0..32] = Packet[8..23] (source address ) + + * Packet[24..39] (destination address) + * + * Result = Hash(Buffer, 32) + */ +#define _XEN_NETIF_CTRL_HASH_TYPE_IPV6 2 +#define XEN_NETIF_CTRL_HASH_TYPE_IPV6 \ + (1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV6) + +/* + * A hash calculated over an IP version 6 header and TCP header as + * follows: + * + * Buffer[0..36] = Packet[8..23] (source address) + + * Packet[24..39] (destination address) + + * Packet[40..41] (source port) + + * Packet[42..43] (destination port) + * + * Result = Hash(Buffer, 36) + */ +#define _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP 3 +#define XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP \ + (1 << _XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP) + +/* + * Hash algorithms + * =============== + */ + +#define XEN_NETIF_CTRL_HASH_ALGORITHM_NONE 0 + +/* + * Toeplitz hash: + */ + +#define XEN_NETIF_CTRL_HASH_ALGORITHM_TOEPLITZ 1 + +/* + * This algorithm uses a 'key' as well as the data buffer itself. + * (Buffer[] and Key[] are treated as shift-registers where the MSB of + * Buffer/Key[0] is considered 'left-most' and the LSB of Buffer/Key[N-1] + * is the 'right-most'). + * + * Value = 0 + * For number of bits in Buffer[] + * If (left-most bit of Buffer[] is 1) + * Value ^= left-most 32 bits of Key[] + * Key[] << 1 + * Buffer[] << 1 + * + * The code below is provided for convenience where an operating system + * does not already provide an implementation. + */ +#ifdef XEN_NETIF_DEFINE_TOEPLITZ +static uint32_t xen_netif_toeplitz_hash(const uint8_t *key, + unsigned int keylen, + const uint8_t *buf, unsigned int buflen) +{ + unsigned int keyi, bufi; + uint64_t prefix = 0; + uint64_t hash = 0; + + /* Pre-load prefix with the first 8 bytes of the key */ + for (keyi = 0; keyi < 8; keyi++) { + prefix <<= 8; + prefix |= (keyi < keylen) ? key[keyi] : 0; + } + + for (bufi = 0; bufi < buflen; bufi++) { + uint8_t byte = buf[bufi]; + unsigned int bit; + + for (bit = 0; bit < 8; bit++) { + if (byte & 0x80) + hash ^= prefix; + prefix <<= 1; + byte <<= 1; + } + + /* + * 'prefix' has now been left-shifted by 8, so + * OR in the next byte. + */ + prefix |= (keyi < keylen) ? key[keyi] : 0; + keyi++; + } + + /* The valid part of the hash is in the upper 32 bits. */ + return hash >> 32; +} +#endif /* XEN_NETIF_DEFINE_TOEPLITZ */ + +/* + * Control requests (struct xen_netif_ctrl_request) + * ================================================ + * + * All requests have the following format: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | type | data[0] | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | data[1] | data[2] | + * +-----+-----+-----+-----+-----------------------+ + * + * id: the request identifier, echoed in response. + * type: the type of request (see below) + * data[]: any data associated with the request (determined by type) + */ + +struct xen_netif_ctrl_request { + uint16_t id; + uint16_t type; + +#define XEN_NETIF_CTRL_TYPE_INVALID 0 +#define XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS 1 +#define XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS 2 +#define XEN_NETIF_CTRL_TYPE_SET_HASH_KEY 3 +#define XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE 4 +#define XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE 5 +#define XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING 6 +#define XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM 7 + + uint32_t data[3]; +}; + +/* + * Control responses (struct xen_netif_ctrl_response) + * ================================================== + * + * All responses have the following format: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | type | status | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | data | + * +-----+-----+-----+-----+ + * + * id: the corresponding request identifier + * type: the type of the corresponding request + * status: the status of request processing + * data: any data associated with the response (determined by type and + * status) + */ + +struct xen_netif_ctrl_response { + uint16_t id; + uint16_t type; + uint32_t status; + +#define XEN_NETIF_CTRL_STATUS_SUCCESS 0 +#define XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED 1 +#define XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER 2 +#define XEN_NETIF_CTRL_STATUS_BUFFER_OVERFLOW 3 + + uint32_t data; +}; + +/* + * Control messages + * ================ + * + * XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM + * -------------------------------------- + * + * This is sent by the frontend to set the desired hash algorithm. + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_SET_HASH_ALGORITHM + * data[0] = a XEN_NETIF_CTRL_HASH_ALGORITHM_* value + * data[1] = 0 + * data[2] = 0 + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not + * supported + * XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - The algorithm is not + * supported + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * + * NOTE: Setting data[0] to XEN_NETIF_CTRL_HASH_ALGORITHM_NONE disables + * hashing and the backend is free to choose how it steers packets + * to queues (which is the default behaviour). + * + * XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS + * ---------------------------------- + * + * This is sent by the frontend to query the types of hash supported by + * the backend. + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_GET_HASH_FLAGS + * data[0] = 0 + * data[1] = 0 + * data[2] = 0 + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not supported + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * data = supported hash types (if operation was successful) + * + * NOTE: A valid hash algorithm must be selected before this operation can + * succeed. + * + * XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS + * ---------------------------------- + * + * This is sent by the frontend to set the types of hash that the backend + * should calculate. (See above for hash type definitions). + * Note that the 'maximal' type of hash should always be chosen. For + * example, if the frontend sets both IPV4 and IPV4_TCP hash types then + * the latter hash type should be calculated for any TCP packet and the + * former only calculated for non-TCP packets. + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_SET_HASH_FLAGS + * data[0] = bitwise OR of XEN_NETIF_CTRL_HASH_TYPE_* values + * data[1] = 0 + * data[2] = 0 + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not + * supported + * XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - One or more flag + * value is invalid or + * unsupported + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * data = 0 + * + * NOTE: A valid hash algorithm must be selected before this operation can + * succeed. + * Also, setting data[0] to zero disables hashing and the backend + * is free to choose how it steers packets to queues. + * + * XEN_NETIF_CTRL_TYPE_SET_HASH_KEY + * -------------------------------- + * + * This is sent by the frontend to set the key of the hash if the algorithm + * requires it. (See hash algorithms above). + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_SET_HASH_KEY + * data[0] = grant reference of page containing the key (assumed to + * start at beginning of grant) + * data[1] = size of key in octets + * data[2] = 0 + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not + * supported + * XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - Key size is invalid + * XEN_NETIF_CTRL_STATUS_BUFFER_OVERFLOW - Key size is larger + * than the backend + * supports + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * data = 0 + * + * NOTE: Any key octets not specified are assumed to be zero (the key + * is assumed to be empty by default) and specifying a new key + * invalidates any previous key, hence specifying a key size of + * zero will clear the key (which ensures that the calculated hash + * will always be zero). + * The maximum size of key is algorithm and backend specific, but + * is also limited by the single grant reference. + * The grant reference may be read-only and must remain valid until + * the response has been processed. + * + * XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE + * ----------------------------------------- + * + * This is sent by the frontend to query the maximum size of mapping + * table supported by the backend. The size is specified in terms of + * table entries. + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_GET_HASH_MAPPING_SIZE + * data[0] = 0 + * data[1] = 0 + * data[2] = 0 + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not supported + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * data = maximum number of entries allowed in the mapping table + * (if operation was successful) or zero if a mapping table is + * not supported (i.e. hash mapping is done only by modular + * arithmetic). + * + * XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE + * ------------------------------------- + * + * This is sent by the frontend to set the actual size of the mapping + * table to be used by the backend. The size is specified in terms of + * table entries. + * Any previous table is invalidated by this message and any new table + * is assumed to be zero filled. + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE + * data[0] = number of entries in mapping table + * data[1] = 0 + * data[2] = 0 + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not + * supported + * XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - Table size is invalid + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * data = 0 + * + * NOTE: Setting data[0] to 0 means that hash mapping should be done + * using modular arithmetic. + * + * XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING + * ------------------------------------ + * + * This is sent by the frontend to set the content of the table mapping + * hash value to queue number. The backend should calculate the hash from + * the packet header, use it as an index into the table (modulo the size + * of the table) and then steer the packet to the queue number found at + * that index. + * + * Request: + * + * type = XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING + * data[0] = grant reference of page containing the mapping (sub-)table + * (assumed to start at beginning of grant) + * data[1] = size of (sub-)table in entries + * data[2] = offset, in entries, of sub-table within overall table + * + * Response: + * + * status = XEN_NETIF_CTRL_STATUS_NOT_SUPPORTED - Operation not + * supported + * XEN_NETIF_CTRL_STATUS_INVALID_PARAMETER - Table size or content + * is invalid + * XEN_NETIF_CTRL_STATUS_BUFFER_OVERFLOW - Table size is larger + * than the backend + * supports + * XEN_NETIF_CTRL_STATUS_SUCCESS - Operation successful + * data = 0 + * + * NOTE: The overall table has the following format: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | mapping[0] | mapping[1] | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | . | + * | . | + * | . | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | mapping[N-2] | mapping[N-1] | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * where N is specified by a XEN_NETIF_CTRL_TYPE_SET_HASH_MAPPING_SIZE + * message and each mapping must specifies a queue between 0 and + * "multi-queue-num-queues" (see above). + * The backend may support a mapping table larger than can be + * mapped by a single grant reference. Thus sub-tables within a + * larger table can be individually set by sending multiple messages + * with differing offset values. Specifying a new sub-table does not + * invalidate any table data outside that range. + * The grant reference may be read-only and must remain valid until + * the response has been processed. + */ + +DEFINE_RING_TYPES(xen_netif_ctrl, + struct xen_netif_ctrl_request, + struct xen_netif_ctrl_response); + +/* + * Guest transmit + * ============== + * + * This is the 'wire' format for transmit (frontend -> backend) packets: + * + * Fragment 1: xen_netif_tx_request_t - flags = XEN_NETTXF_* + * size = total packet size + * [Extra 1: xen_netif_extra_info_t] - (only if fragment 1 flags include + * XEN_NETTXF_extra_info) + * ... + * [Extra N: xen_netif_extra_info_t] - (only if extra N-1 flags include + * XEN_NETIF_EXTRA_MORE) * ... - * Request N: xen_netif_tx_request -- 0 + * Fragment N: xen_netif_tx_request_t - (only if fragment N-1 flags include + * XEN_NETTXF_more_data - flags on preceding + * extras are not relevant here) + * flags = 0 + * size = fragment size + * + * NOTE: + * + * This format slightly is different from that used for receive + * (backend -> frontend) packets. Specifically, in a multi-fragment + * packet the actual size of fragment 1 can only be determined by + * subtracting the sizes of fragments 2..N from the total packet size. + * + * Ring slot size is 12 octets, however not all request/response + * structs use the full size. + * + * tx request data (xen_netif_tx_request_t) + * ------------------------------------ + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | grant ref | offset | flags | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | size | + * +-----+-----+-----+-----+ + * + * grant ref: Reference to buffer page. + * offset: Offset within buffer page. + * flags: XEN_NETTXF_*. + * id: request identifier, echoed in response. + * size: packet size in bytes. + * + * tx response (xen_netif_tx_response_t) + * --------------------------------- + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | status | unused | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | unused | + * +-----+-----+-----+-----+ + * + * id: reflects id in transmit request + * status: XEN_NETIF_RSP_* + * + * Guest receive + * ============= + * + * This is the 'wire' format for receive (backend -> frontend) packets: + * + * Fragment 1: xen_netif_rx_request_t - flags = XEN_NETRXF_* + * size = fragment size + * [Extra 1: xen_netif_extra_info_t] - (only if fragment 1 flags include + * XEN_NETRXF_extra_info) + * ... + * [Extra N: xen_netif_extra_info_t] - (only if extra N-1 flags include + * XEN_NETIF_EXTRA_MORE) + * ... + * Fragment N: xen_netif_rx_request_t - (only if fragment N-1 flags include + * XEN_NETRXF_more_data - flags on preceding + * extras are not relevant here) + * flags = 0 + * size = fragment size + * + * NOTE: + * + * This format slightly is different from that used for transmit + * (frontend -> backend) packets. Specifically, in a multi-fragment + * packet the size of the packet can only be determined by summing the + * sizes of fragments 1..N. + * + * Ring slot size is 8 octets. + * + * rx request (xen_netif_rx_request_t) + * ------------------------------- + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | pad | gref | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * id: request identifier, echoed in response. + * gref: reference to incoming granted frame. + * + * rx response (xen_netif_rx_response_t) + * --------------------------------- + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | id | offset | flags | status | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * id: reflects id in receive request + * offset: offset in page of start of received packet + * flags: XEN_NETRXF_* + * status: -ve: XEN_NETIF_RSP_*; +ve: Rx'ed pkt size. + * + * NOTE: Historically, to support GSO on the frontend receive side, Linux + * netfront does not make use of the rx response id (because, as + * described below, extra info structures overlay the id field). + * Instead it assumes that responses always appear in the same ring + * slot as their corresponding request. Thus, to maintain + * compatibility, backends must make sure this is the case. + * + * Extra Info + * ========== + * + * Can be present if initial request or response has NET{T,R}XF_extra_info, + * or previous extra request has XEN_NETIF_EXTRA_MORE. + * + * The struct therefore needs to fit into either a tx or rx slot and + * is therefore limited to 8 octets. + * + * NOTE: Because extra info data overlays the usual request/response + * structures, there is no id information in the opposite direction. + * So, if an extra info overlays an rx response the frontend can + * assume that it is in the same ring slot as the request that was + * consumed to make the slot available, and the backend must ensure + * this assumption is true. + * + * extra info (xen_netif_extra_info_t) + * ------------------------------- + * + * General format: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags| type specific data | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * | padding for tx | + * +-----+-----+-----+-----+ + * + * type: XEN_NETIF_EXTRA_TYPE_* + * flags: XEN_NETIF_EXTRA_FLAG_* + * padding for tx: present only in the tx case due to 8 octet limit + * from rx case. Not shown in type specific entries + * below. + * + * XEN_NETIF_EXTRA_TYPE_GSO: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags| size |type | pad | features | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * type: Must be XEN_NETIF_EXTRA_TYPE_GSO + * flags: XEN_NETIF_EXTRA_FLAG_* + * size: Maximum payload size of each segment. For example, + * for TCP this is just the path MSS. + * type: XEN_NETIF_GSO_TYPE_*: This determines the protocol of + * the packet and any extra features required to segment the + * packet properly. + * features: EN_XEN_NETIF_GSO_FEAT_*: This specifies any extra GSO + * features required to process this packet, such as ECN + * support for TCPv4. + * + * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}: + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags| addr | + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * type: Must be XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL} + * flags: XEN_NETIF_EXTRA_FLAG_* + * addr: address to add/remove + * + * XEN_NETIF_EXTRA_TYPE_HASH: + * + * A backend that supports teoplitz hashing is assumed to accept + * this type of extra info in transmit packets. + * A frontend that enables hashing is assumed to accept + * this type of extra info in receive packets. + * + * 0 1 2 3 4 5 6 7 octet + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * |type |flags|htype| alg |LSB ---- value ---- MSB| + * +-----+-----+-----+-----+-----+-----+-----+-----+ + * + * type: Must be XEN_NETIF_EXTRA_TYPE_HASH + * flags: XEN_NETIF_EXTRA_FLAG_* + * htype: Hash type (one of _XEN_NETIF_CTRL_HASH_TYPE_* - see above) + * alg: The algorithm used to calculate the hash (one of + * XEN_NETIF_CTRL_HASH_TYPE_ALGORITHM_* - see above) + * value: Hash value */ /* Protocol checksum field is blank in the packet (hardware offload)? */ -#define _XEN_NETTXF_csum_blank (0) -#define XEN_NETTXF_csum_blank (1U<<_XEN_NETTXF_csum_blank) +#define _XEN_NETTXF_csum_blank (0) +#define XEN_NETTXF_csum_blank (1U<<_XEN_NETTXF_csum_blank) /* Packet data has been validated against protocol checksum. */ -#define _XEN_NETTXF_data_validated (1) -#define XEN_NETTXF_data_validated (1U<<_XEN_NETTXF_data_validated) +#define _XEN_NETTXF_data_validated (1) +#define XEN_NETTXF_data_validated (1U<<_XEN_NETTXF_data_validated) /* Packet continues in the next request descriptor. */ -#define _XEN_NETTXF_more_data (2) -#define XEN_NETTXF_more_data (1U<<_XEN_NETTXF_more_data) +#define _XEN_NETTXF_more_data (2) +#define XEN_NETTXF_more_data (1U<<_XEN_NETTXF_more_data) /* Packet to be followed by extra descriptor(s). */ -#define _XEN_NETTXF_extra_info (3) -#define XEN_NETTXF_extra_info (1U<<_XEN_NETTXF_extra_info) +#define _XEN_NETTXF_extra_info (3) +#define XEN_NETTXF_extra_info (1U<<_XEN_NETTXF_extra_info) #define XEN_NETIF_MAX_TX_SIZE 0xFFFF struct xen_netif_tx_request { - grant_ref_t gref; /* Reference to buffer page */ - uint16_t offset; /* Offset within buffer page */ - uint16_t flags; /* XEN_NETTXF_* */ - uint16_t id; /* Echoed in response message. */ - uint16_t size; /* Packet size in bytes. */ + grant_ref_t gref; + uint16_t offset; + uint16_t flags; + uint16_t id; + uint16_t size; }; /* Types of xen_netif_extra_info descriptors. */ -#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */ -#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */ -#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2) /* u.mcast */ -#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */ -#define XEN_NETIF_EXTRA_TYPE_MAX (4) +#define XEN_NETIF_EXTRA_TYPE_NONE (0) /* Never used - invalid */ +#define XEN_NETIF_EXTRA_TYPE_GSO (1) /* u.gso */ +#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2) /* u.mcast */ +#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3) /* u.mcast */ +#define XEN_NETIF_EXTRA_TYPE_HASH (4) /* u.hash */ +#define XEN_NETIF_EXTRA_TYPE_MAX (5) -/* xen_netif_extra_info flags. */ -#define _XEN_NETIF_EXTRA_FLAG_MORE (0) -#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) +/* xen_netif_extra_info_t flags. */ +#define _XEN_NETIF_EXTRA_FLAG_MORE (0) +#define XEN_NETIF_EXTRA_FLAG_MORE (1U<<_XEN_NETIF_EXTRA_FLAG_MORE) /* GSO types */ -#define XEN_NETIF_GSO_TYPE_NONE (0) -#define XEN_NETIF_GSO_TYPE_TCPV4 (1) -#define XEN_NETIF_GSO_TYPE_TCPV6 (2) +#define XEN_NETIF_GSO_TYPE_NONE (0) +#define XEN_NETIF_GSO_TYPE_TCPV4 (1) +#define XEN_NETIF_GSO_TYPE_TCPV6 (2) /* - * This structure needs to fit within both netif_tx_request and - * netif_rx_response for compatibility. + * This structure needs to fit within both xen_netif_tx_request_t and + * xen_netif_rx_response_t for compatibility. */ struct xen_netif_extra_info { - uint8_t type; /* XEN_NETIF_EXTRA_TYPE_* */ - uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */ - + uint8_t type; + uint8_t flags; union { struct { - /* - * Maximum payload size of each segment. For - * example, for TCP this is just the path MSS. - */ uint16_t size; - - /* - * GSO type. This determines the protocol of - * the packet and any extra features required - * to segment the packet properly. - */ - uint8_t type; /* XEN_NETIF_GSO_TYPE_* */ - - /* Future expansion. */ + uint8_t type; uint8_t pad; - - /* - * GSO features. This specifies any extra GSO - * features required to process this packet, - * such as ECN support for TCPv4. - */ - uint16_t features; /* XEN_NETIF_GSO_FEAT_* */ + uint16_t features; } gso; - struct { - uint8_t addr[6]; /* Address to add/remove. */ + uint8_t addr[6]; } mcast; - + struct { + uint8_t type; + uint8_t algorithm; + uint8_t value[4]; + } hash; uint16_t pad[3]; } u; }; struct xen_netif_tx_response { uint16_t id; - int16_t status; /* XEN_NETIF_RSP_* */ + int16_t status; }; struct xen_netif_rx_request { - uint16_t id; /* Echoed in response message. */ - grant_ref_t gref; /* Reference to incoming granted frame */ + uint16_t id; /* Echoed in response message. */ + uint16_t pad; + grant_ref_t gref; }; /* Packet data has been validated against protocol checksum. */ -#define _XEN_NETRXF_data_validated (0) -#define XEN_NETRXF_data_validated (1U<<_XEN_NETRXF_data_validated) +#define _XEN_NETRXF_data_validated (0) +#define XEN_NETRXF_data_validated (1U<<_XEN_NETRXF_data_validated) /* Protocol checksum field is blank in the packet (hardware offload)? */ -#define _XEN_NETRXF_csum_blank (1) -#define XEN_NETRXF_csum_blank (1U<<_XEN_NETRXF_csum_blank) +#define _XEN_NETRXF_csum_blank (1) +#define XEN_NETRXF_csum_blank (1U<<_XEN_NETRXF_csum_blank) /* Packet continues in the next request descriptor. */ -#define _XEN_NETRXF_more_data (2) -#define XEN_NETRXF_more_data (1U<<_XEN_NETRXF_more_data) +#define _XEN_NETRXF_more_data (2) +#define XEN_NETRXF_more_data (1U<<_XEN_NETRXF_more_data) /* Packet to be followed by extra descriptor(s). */ -#define _XEN_NETRXF_extra_info (3) -#define XEN_NETRXF_extra_info (1U<<_XEN_NETRXF_extra_info) +#define _XEN_NETRXF_extra_info (3) +#define XEN_NETRXF_extra_info (1U<<_XEN_NETRXF_extra_info) -/* GSO Prefix descriptor. */ -#define _XEN_NETRXF_gso_prefix (4) -#define XEN_NETRXF_gso_prefix (1U<<_XEN_NETRXF_gso_prefix) +/* Packet has GSO prefix. Deprecated but included for compatibility */ +#define _XEN_NETRXF_gso_prefix (4) +#define XEN_NETRXF_gso_prefix (1U<<_XEN_NETRXF_gso_prefix) struct xen_netif_rx_response { - uint16_t id; - uint16_t offset; /* Offset in page of start of received packet */ - uint16_t flags; /* XEN_NETRXF_* */ - int16_t status; /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */ + uint16_t id; + uint16_t offset; + uint16_t flags; + int16_t status; }; /* - * Generate netif ring structures and types. + * Generate xen_netif ring structures and types. */ -DEFINE_RING_TYPES(xen_netif_tx, - struct xen_netif_tx_request, +DEFINE_RING_TYPES(xen_netif_tx, struct xen_netif_tx_request, struct xen_netif_tx_response); -DEFINE_RING_TYPES(xen_netif_rx, - struct xen_netif_rx_request, +DEFINE_RING_TYPES(xen_netif_rx, struct xen_netif_rx_request, struct xen_netif_rx_response); -#define XEN_NETIF_RSP_DROPPED -2 -#define XEN_NETIF_RSP_ERROR -1 -#define XEN_NETIF_RSP_OKAY 0 -/* No response: used for auxiliary requests (e.g., xen_netif_extra_info). */ -#define XEN_NETIF_RSP_NULL 1 +#define XEN_NETIF_RSP_DROPPED -2 +#define XEN_NETIF_RSP_ERROR -1 +#define XEN_NETIF_RSP_OKAY 0 +/* No response: used for auxiliary requests (e.g., xen_netif_extra_info_t). */ +#define XEN_NETIF_RSP_NULL 1 #endif -- cgit v1.2.3 From 470c3822d2ab7fadcbb1ac317ef27b31caac370e Mon Sep 17 00:00:00 2001 From: LABBE Corentin Date: Thu, 10 Mar 2016 13:58:58 +0100 Subject: phy: remove documentation of removed members of phy_device structure Commit e5a03bfd873c ("phy: Add an mdio_device structure") removed addr, bus and dev member of the phy_device structure. This patch remove the documentation about those members. Signed-off-by: LABBE Corentin Reviewed-by: Andrew Lunn Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d6f3641e7933..2abd7918f64f 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,8 +327,6 @@ struct phy_c45_device_ids { /* phy_device: An instance of a PHY * * drv: Pointer to the driver for this PHY instance - * bus: Pointer to the bus this PHY is on - * dev: driver model device structure for this PHY * phy_id: UID for this device found during discovery * c45_ids: 802.3-c45 Device Identifers if is_c45. * is_c45: Set to true if this phy uses clause 45 addressing. @@ -338,7 +336,6 @@ struct phy_c45_device_ids { * suspended: Set to true if this phy has been suspended successfully. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. - * addr: Bus address of PHY * link_timeout: The number of timer firings to wait before the * giving up on the current attempt at acquiring a link * irq: IRQ number of the PHY's interrupt (-1 if none) -- cgit v1.2.3 From cea8768f333e3f0bc231d8b815aa4a9e63fa990c Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 10 Mar 2016 18:33:07 -0300 Subject: sctp: allow sctp_transmit_packet and others to use gfp Currently sctp_sendmsg() triggers some calls that will allocate memory with GFP_ATOMIC even when not necessary. In the case of sctp_packet_transmit it will allocate a linear skb that will be used to construct the packet and this may cause sends to fail due to ENOMEM more often than anticipated specially with big MTUs. This patch thus allows it to inherit gfp flags from upper calls so that it can use GFP_KERNEL if it was triggered by a sctp_sendmsg call or similar. All others, like retransmits or flushes started from BH, are still allocated using GFP_ATOMIC. In netperf tests this didn't result in any performance drawbacks when memory is not too fragmented and made it trigger ENOMEM way less often. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- include/net/sctp/structs.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 487ef34bbd63..efc01743b9d6 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -201,7 +201,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *, struct sctp_chunk * sctp_make_datafrag_empty(struct sctp_association *, const struct sctp_sndrcvinfo *sinfo, int len, const __u8 flags, - __u16 ssn); + __u16 ssn, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *, const __u32); struct sctp_chunk *sctp_make_sack(const struct sctp_association *); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d05b56641abc..9d237669c52c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -655,7 +655,7 @@ void sctp_chunk_free(struct sctp_chunk *); void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data); struct sctp_chunk *sctp_chunkify(struct sk_buff *, const struct sctp_association *, - struct sock *); + struct sock *, gfp_t gfp); void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); @@ -717,10 +717,10 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *, __u16 sport, __u16 dport); struct sctp_packet *sctp_packet_config(struct sctp_packet *, __u32 vtag, int); sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *, - struct sctp_chunk *, int); + struct sctp_chunk *, int, gfp_t); sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *, struct sctp_chunk *); -int sctp_packet_transmit(struct sctp_packet *); +int sctp_packet_transmit(struct sctp_packet *, gfp_t); void sctp_packet_free(struct sctp_packet *); static inline int sctp_packet_empty(struct sctp_packet *packet) @@ -1053,7 +1053,7 @@ struct sctp_outq { void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); -int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk); +int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); @@ -1061,7 +1061,7 @@ void sctp_outq_restart(struct sctp_outq *); void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, sctp_retransmit_reason_t); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); -int sctp_outq_uncork(struct sctp_outq *); +int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); /* Uncork and flush an outqueue. */ static inline void sctp_outq_cork(struct sctp_outq *q) { -- cgit v1.2.3 From dece8d2b78d19df7fe5e4e965f1f0d1a3e188d1b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 11 Mar 2016 18:07:31 +0100 Subject: uapi: add MACsec bits Signed-off-by: Sabrina Dubroca Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/if_ether.h | 1 + include/uapi/linux/if_link.h | 29 ++++++++ include/uapi/linux/if_macsec.h | 161 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 include/uapi/linux/if_macsec.h (limited to 'include') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index ebd10e624598..e25ebcfbcb48 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -173,6 +173,7 @@ header-y += if_hippi.h header-y += if_infiniband.h header-y += if_link.h header-y += if_ltalk.h +header-y += if_macsec.h header-y += if_packet.h header-y += if_phonet.h header-y += if_plip.h diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index ea9221b0331a..4a93051c578c 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -83,6 +83,7 @@ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ #define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ #define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ #define ETH_P_1588 0x88F7 /* IEEE 1588 Timesync */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 249eef9a21bd..8e3f88fa5b59 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -413,6 +413,35 @@ enum { #define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h new file mode 100644 index 000000000000..26b0d1e3e3e7 --- /dev/null +++ b/include/uapi/linux/if_macsec.h @@ -0,0 +1,161 @@ +/* + * include/uapi/linux/if_macsec.h - MACsec device + * + * Copyright (c) 2015 Sabrina Dubroca + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef _UAPI_MACSEC_H +#define _UAPI_MACSEC_H + +#include + +#define MACSEC_GENL_NAME "macsec" +#define MACSEC_GENL_VERSION 1 + +#define MACSEC_MAX_KEY_LEN 128 + +#define DEFAULT_CIPHER_ID 0x0080020001000001ULL +#define DEFAULT_CIPHER_ALT 0x0080C20001000001ULL + +#define MACSEC_MIN_ICV_LEN 8 +#define MACSEC_MAX_ICV_LEN 32 + +enum macsec_attrs { + MACSEC_ATTR_UNSPEC, + MACSEC_ATTR_IFINDEX, /* u32, ifindex of the MACsec netdevice */ + MACSEC_ATTR_RXSC_CONFIG, /* config, nested macsec_rxsc_attrs */ + MACSEC_ATTR_SA_CONFIG, /* config, nested macsec_sa_attrs */ + MACSEC_ATTR_SECY, /* dump, nested macsec_secy_attrs */ + MACSEC_ATTR_TXSA_LIST, /* dump, nested, macsec_sa_attrs for each TXSA */ + MACSEC_ATTR_RXSC_LIST, /* dump, nested, macsec_rxsc_attrs for each RXSC */ + MACSEC_ATTR_TXSC_STATS, /* dump, nested, macsec_txsc_stats_attr */ + MACSEC_ATTR_SECY_STATS, /* dump, nested, macsec_secy_stats_attr */ + __MACSEC_ATTR_END, + NUM_MACSEC_ATTR = __MACSEC_ATTR_END, + MACSEC_ATTR_MAX = __MACSEC_ATTR_END - 1, +}; + +enum macsec_secy_attrs { + MACSEC_SECY_ATTR_UNSPEC, + MACSEC_SECY_ATTR_SCI, + MACSEC_SECY_ATTR_ENCODING_SA, + MACSEC_SECY_ATTR_WINDOW, + MACSEC_SECY_ATTR_CIPHER_SUITE, + MACSEC_SECY_ATTR_ICV_LEN, + MACSEC_SECY_ATTR_PROTECT, + MACSEC_SECY_ATTR_REPLAY, + MACSEC_SECY_ATTR_OPER, + MACSEC_SECY_ATTR_VALIDATE, + MACSEC_SECY_ATTR_ENCRYPT, + MACSEC_SECY_ATTR_INC_SCI, + MACSEC_SECY_ATTR_ES, + MACSEC_SECY_ATTR_SCB, + __MACSEC_SECY_ATTR_END, + NUM_MACSEC_SECY_ATTR = __MACSEC_SECY_ATTR_END, + MACSEC_SECY_ATTR_MAX = __MACSEC_SECY_ATTR_END - 1, +}; + +enum macsec_rxsc_attrs { + MACSEC_RXSC_ATTR_UNSPEC, + MACSEC_RXSC_ATTR_SCI, /* config/dump, u64 */ + MACSEC_RXSC_ATTR_ACTIVE, /* config/dump, u8 0..1 */ + MACSEC_RXSC_ATTR_SA_LIST, /* dump, nested */ + MACSEC_RXSC_ATTR_STATS, /* dump, nested, macsec_rxsc_stats_attr */ + __MACSEC_RXSC_ATTR_END, + NUM_MACSEC_RXSC_ATTR = __MACSEC_RXSC_ATTR_END, + MACSEC_RXSC_ATTR_MAX = __MACSEC_RXSC_ATTR_END - 1, +}; + +enum macsec_sa_attrs { + MACSEC_SA_ATTR_UNSPEC, + MACSEC_SA_ATTR_AN, /* config/dump, u8 0..3 */ + MACSEC_SA_ATTR_ACTIVE, /* config/dump, u8 0..1 */ + MACSEC_SA_ATTR_PN, /* config/dump, u32 */ + MACSEC_SA_ATTR_KEY, /* config, data */ + MACSEC_SA_ATTR_KEYID, /* config/dump, u64 */ + MACSEC_SA_ATTR_STATS, /* dump, nested, macsec_sa_stats_attr */ + __MACSEC_SA_ATTR_END, + NUM_MACSEC_SA_ATTR = __MACSEC_SA_ATTR_END, + MACSEC_SA_ATTR_MAX = __MACSEC_SA_ATTR_END - 1, +}; + +enum macsec_nl_commands { + MACSEC_CMD_GET_TXSC, + MACSEC_CMD_ADD_RXSC, + MACSEC_CMD_DEL_RXSC, + MACSEC_CMD_UPD_RXSC, + MACSEC_CMD_ADD_TXSA, + MACSEC_CMD_DEL_TXSA, + MACSEC_CMD_UPD_TXSA, + MACSEC_CMD_ADD_RXSA, + MACSEC_CMD_DEL_RXSA, + MACSEC_CMD_UPD_RXSA, +}; + +/* u64 per-RXSC stats */ +enum macsec_rxsc_stats_attr { + MACSEC_RXSC_STATS_ATTR_UNSPEC, + MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, + MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, + MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, + __MACSEC_RXSC_STATS_ATTR_END, + NUM_MACSEC_RXSC_STATS_ATTR = __MACSEC_RXSC_STATS_ATTR_END, + MACSEC_RXSC_STATS_ATTR_MAX = __MACSEC_RXSC_STATS_ATTR_END - 1, +}; + +/* u32 per-{RX,TX}SA stats */ +enum macsec_sa_stats_attr { + MACSEC_SA_STATS_ATTR_UNSPEC, + MACSEC_SA_STATS_ATTR_IN_PKTS_OK, + MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, + MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, + MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, + MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, + MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, + MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, + __MACSEC_SA_STATS_ATTR_END, + NUM_MACSEC_SA_STATS_ATTR = __MACSEC_SA_STATS_ATTR_END, + MACSEC_SA_STATS_ATTR_MAX = __MACSEC_SA_STATS_ATTR_END - 1, +}; + +/* u64 per-TXSC stats */ +enum macsec_txsc_stats_attr { + MACSEC_TXSC_STATS_ATTR_UNSPEC, + MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, + MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, + MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, + MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, + __MACSEC_TXSC_STATS_ATTR_END, + NUM_MACSEC_TXSC_STATS_ATTR = __MACSEC_TXSC_STATS_ATTR_END, + MACSEC_TXSC_STATS_ATTR_MAX = __MACSEC_TXSC_STATS_ATTR_END - 1, +}; + +/* u64 per-SecY stats */ +enum macsec_secy_stats_attr { + MACSEC_SECY_STATS_ATTR_UNSPEC, + MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, + MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, + MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, + MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, + MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, + MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, + MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, + MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, + __MACSEC_SECY_STATS_ATTR_END, + NUM_MACSEC_SECY_STATS_ATTR = __MACSEC_SECY_STATS_ATTR_END, + MACSEC_SECY_STATS_ATTR_MAX = __MACSEC_SECY_STATS_ATTR_END - 1, +}; + +#endif /* _UAPI_MACSEC_H */ -- cgit v1.2.3 From 3c17578473b9be5a6e7680a45ea97e1d56e13249 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 11 Mar 2016 18:07:32 +0100 Subject: net: add MACsec netdevice priv_flags and helper Signed-off-by: Sabrina Dubroca Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 41df0b450757..be693b34662f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1328,6 +1328,7 @@ struct net_device_ops { * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. the master device for bridged veth) + * @IFF_MACSEC: device is a MACsec device */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1357,6 +1358,7 @@ enum netdev_priv_flags { IFF_TEAM = 1<<24, IFF_RXFH_CONFIGURED = 1<<25, IFF_PHONY_HEADROOM = 1<<26, + IFF_MACSEC = 1<<27, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1385,6 +1387,7 @@ enum netdev_priv_flags { #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE #define IFF_TEAM IFF_TEAM #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED +#define IFF_MACSEC IFF_MACSEC /** * struct net_device - The DEVICE structure. @@ -4045,6 +4048,11 @@ static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, skb->mac_len = mac_len; } +static inline bool netif_is_macsec(const struct net_device *dev) +{ + return dev->priv_flags & IFF_MACSEC; +} + static inline bool netif_is_macvlan(const struct net_device *dev) { return dev->priv_flags & IFF_MACVLAN; -- cgit v1.2.3 From 01cfbad79a5e2b835abf6a8154a341d75a6fc8cd Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 11 Mar 2016 14:05:34 -0800 Subject: ipv4: Update parameters for csum_tcpudp_magic to their original types This patch updates all instances of csum_tcpudp_magic and csum_tcpudp_nofold to reflect the types that are usually used as the source inputs. For example the protocol field is populated based on nexthdr which is actually an unsigned 8 bit value. The length is usually populated based on skb->len which is an unsigned integer. This addresses an issue in which the IPv6 function csum_ipv6_magic was generating a checksum using the full 32b of skb->len while csum_tcpudp_magic was only using the lower 16 bits. As a result we could run into issues when attempting to adjust the checksum as there was no protocol agnostic way to update it. With this change the value is still truncated as many architectures use "(len + proto) << 8", however this truncation only occurs for values greater than 16776960 in length and as such is unlikely to occur as we stop the inner headers at ~64K in size. I did have to make a few minor changes in the arm, mn10300, nios2, and score versions of the function in order to support these changes as they were either using things such as an OR to combine the protocol and length, or were using ntohs to convert the length which would have truncated the value. I also updated a few spots in terms of whitespace and type differences for the addresses. Most of this was just to make sure all of the definitions were in sync going forward. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/asm-generic/checksum.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/asm-generic/checksum.h b/include/asm-generic/checksum.h index 59811df58c5b..3150cbd8eb21 100644 --- a/include/asm-generic/checksum.h +++ b/include/asm-generic/checksum.h @@ -65,14 +65,14 @@ static inline __sum16 csum_fold(__wsum csum) * returns a 16-bit checksum, already complemented */ extern __wsum -csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, - unsigned short proto, __wsum sum); +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum); #endif #ifndef csum_tcpudp_magic static inline __sum16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, unsigned short len, - unsigned short proto, __wsum sum) +csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, + __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } -- cgit v1.2.3 From 1e94082963747b551b129528714827f76a090e93 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 11 Mar 2016 14:05:41 -0800 Subject: ipv6: Pass proto to csum_ipv6_magic as __u8 instead of unsigned short This patch updates csum_ipv6_magic so that it correctly recognizes that protocol is a unsigned 8 bit value. This will allow us to better understand what limitations may or may not be present in how we handle the data. For example there are a number of places that call htonl on the protocol value. This is likely not necessary and can be replaced with a multiplication by ntohl(1) which will be converted to a shift by the compiler. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ip6_checksum.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index 1a49b73f7f6e..cca840584c88 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -37,8 +37,7 @@ #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, - __wsum csum); + __u32 len, __u8 proto, __wsum csum); #endif static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) -- cgit v1.2.3 From f2900acea8018c4525ddaa86c7f7cd8afd3f0cc4 Mon Sep 17 00:00:00 2001 From: Marcin Wojtas Date: Mon, 14 Mar 2016 09:39:02 +0100 Subject: bus: mvebu-mbus: provide api for obtaining IO and DRAM window information This commit enables finding appropriate mbus window and obtaining its target id and attribute for given physical address in two separate routines, both for IO and DRAM windows. This functionality is needed for Armada XP/38x Network Controller's Buffer Manager and PnC configuration. [gregory.clement@free-electrons.com: Fix size test for mvebu_mbus_get_dram_win_info] Signed-off-by: Marcin Wojtas [DRAM window information reference in LKv3.10] Signed-off-by: Evan Wang Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- include/linux/mbus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mbus.h b/include/linux/mbus.h index 1f7bc630d225..ea34a867caa0 100644 --- a/include/linux/mbus.h +++ b/include/linux/mbus.h @@ -69,6 +69,9 @@ static inline const struct mbus_dram_target_info *mv_mbus_dram_info_nooverlap(vo int mvebu_mbus_save_cpu_target(u32 *store_addr); void mvebu_mbus_get_pcie_mem_aperture(struct resource *res); void mvebu_mbus_get_pcie_io_aperture(struct resource *res); +int mvebu_mbus_get_dram_win_info(phys_addr_t phyaddr, u8 *target, u8 *attr); +int mvebu_mbus_get_io_win_info(phys_addr_t phyaddr, u32 *size, u8 *target, + u8 *attr); int mvebu_mbus_add_window_remap_by_id(unsigned int target, unsigned int attribute, phys_addr_t base, size_t size, -- cgit v1.2.3 From 8cb2d8bf57e6e004c37db2fb4ce74f4d032b7cd0 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Mon, 14 Mar 2016 09:39:04 +0100 Subject: net: add a hardware buffer management helper API This basic implementation allows to share code between driver using hardware buffer management. As the code is hardware agnostic, there is few helpers, most of the optimization brought by the an HW BM has to be done at driver level. Tested-by: Sebastian Careba Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- include/net/hwbm.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 include/net/hwbm.h (limited to 'include') diff --git a/include/net/hwbm.h b/include/net/hwbm.h new file mode 100644 index 000000000000..47d08662501b --- /dev/null +++ b/include/net/hwbm.h @@ -0,0 +1,28 @@ +#ifndef _HWBM_H +#define _HWBM_H + +struct hwbm_pool { + /* Capacity of the pool */ + int size; + /* Size of the buffers managed */ + int frag_size; + /* Number of buffers currently used by this pool */ + int buf_num; + /* constructor called during alocation */ + int (*construct)(struct hwbm_pool *bm_pool, void *buf); + /* protect acces to the buffer counter*/ + spinlock_t lock; + /* private data */ + void *priv; +}; +#ifdef CONFIG_HWBM +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf); +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp); +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp); +#else +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) {} +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) { return 0; } +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) +{ return 0; } +#endif /* CONFIG_HWBM */ +#endif /* _HWBM_H */ -- cgit v1.2.3 From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 14 Mar 2016 10:52:15 -0700 Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In Per RFC4898, they count segments sent/received containing a positive length data segment (that includes retransmission segments carrying data). Unlike tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments carrying no data (e.g. pure ack). The patch also updates the segs_in in tcp_fastopen_add_skb() so that segs_in >= data_segs_in property is kept. Together with retransmission data, tcpi_data_segs_out gives a better signal on the rxmit rate. v6: Rebase on the latest net-next v5: Eric pointed out that checking skb->len is still needed in tcp_fastopen_add_skb() because skb can carry a FIN without data. Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in() helper is used. Comment is added to the fastopen case to explain why segs_in has to be reset and tcp_segs_in() has to be called before __skb_pull(). v4: Add comment to the changes in tcp_fastopen_add_skb() and also add remark on this case in the commit message. v3: Add const modifier to the skb parameter in tcp_segs_in() v2: Rework based on recent fix by Eric: commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment") Signed-off-by: Martin KaFai Lau Cc: Chris Rapier Cc: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 ++++++ include/net/tcp.h | 10 ++++++++++ include/uapi/linux/tcp.h | 2 ++ 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcbf51da4e1e..7be9b1242354 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -158,6 +158,9 @@ struct tcp_sock { u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ + u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ @@ -165,6 +168,9 @@ struct tcp_sock { u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ + u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. diff --git a/include/net/tcp.h b/include/net/tcp.h index 0302636af98c..c8dbd293daae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1840,4 +1840,14 @@ static inline int tcp_inq(struct sock *sk) return answ; } +static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) +{ + u16 segs_in; + + segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tp->segs_in += segs_in; + if (skb->len > tcp_hdrlen(skb)) + tp->data_segs_in += segs_in; +} + #endif /* _TCP_H */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index fe95446e9abf..53e8e3fe6b1b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -199,6 +199,8 @@ struct tcp_info { __u32 tcpi_notsent_bytes; __u32 tcpi_min_rtt; + __u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */ + __u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */ }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From 5bcbe0f35fb13e31fdd0b2dc9695f19ab0208207 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 12 Mar 2016 00:01:40 +0100 Subject: phy: fixed: Fix removal of phys. The fixed phys delete function simply removed the fixed phy from the internal linked list and freed the memory. It however did not unregister the associated phy device. This meant it was still possible to find the phy device on the mdio bus. Make fixed_phy_del() an internal function and add a fixed_phy_unregister() to unregisters the phy device and then uses fixed_phy_del() to free resources. Modify DSA to use this new API function, so we don't leak phys. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy_fixed.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 2400d2ea4f34..1d41ec44e39d 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -19,7 +19,7 @@ extern struct phy_device *fixed_phy_register(unsigned int irq, struct fixed_phy_status *status, int link_gpio, struct device_node *np); -extern void fixed_phy_del(int phy_addr); +extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); @@ -40,9 +40,8 @@ static inline struct phy_device *fixed_phy_register(unsigned int irq, { return ERR_PTR(-ENODEV); } -static inline int fixed_phy_del(int phy_addr) +static inline void fixed_phy_unregister(struct phy_device *phydev) { - return -ENODEV; } static inline int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, -- cgit v1.2.3 From 71327a4e7d997276d49db92fd3d30008389ee6d5 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sun, 13 Mar 2016 16:21:32 -0400 Subject: net: dsa: rename port_*_bridge routines Rename DSA port_join_bridge and port_leave_bridge routines to respectively port_bridge_join and port_bridge_leave in order to respect an implicit Port::Bridge namespace. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/dsa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 26c0a3fa009a..004e034184c1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,9 +296,9 @@ struct dsa_switch_driver { /* * Bridge integration */ - int (*port_join_bridge)(struct dsa_switch *ds, int port, + int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); - int (*port_leave_bridge)(struct dsa_switch *ds, int port); + int (*port_bridge_leave)(struct dsa_switch *ds, int port); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); -- cgit v1.2.3 From 16bfa7024eba5e36aff38ba62086b9027373007d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sun, 13 Mar 2016 16:21:33 -0400 Subject: net: dsa: make port_bridge_leave return void netdev_upper_dev_unlink() which notifies NETDEV_CHANGEUPPER, returns void, as well as del_nbp(). So there's no advantage to catch an eventual error from the port_bridge_leave routine at the DSA level. Make this routine void for the DSA layer and its existing drivers. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 004e034184c1..6463bb2863ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -298,7 +298,7 @@ struct dsa_switch_driver { */ int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); - int (*port_bridge_leave)(struct dsa_switch *ds, int port); + void (*port_bridge_leave)(struct dsa_switch *ds, int port); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); -- cgit v1.2.3 From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:16 -0800 Subject: netfilter: Remove IP_CT_NEW_REPLY definition. Remove the definition of IP_CT_NEW_REPLY from the kernel as it does not make sense. This allows the definition of IP_CT_NUMBER to be simplified as well. Signed-off-by: Jarno Rajahalme Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 319f47128db8..6d074d14ee27 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -20,9 +20,15 @@ enum ip_conntrack_info { IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY, IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY, - IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY, - /* Number of distinct IP_CT types (no NEW in reply dirn). */ - IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1 + /* No NEW in reply direction. */ + + /* Number of distinct IP_CT types. */ + IP_CT_NUMBER, + + /* only for userspace compatibility */ +#ifndef __KERNEL__ + IP_CT_NEW_REPLY = IP_CT_NUMBER, +#endif }; #define NF_CT_STATE_INVALID_BIT (1 << 0) -- cgit v1.2.3 From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 10 Mar 2016 10:54:23 -0800 Subject: openvswitch: Interface with NAT. Extend OVS conntrack interface to cover NAT. New nested OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action. A bare OVS_CT_ATTR_NAT only mangles existing and expected connections. If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested attributes, new (non-committed/non-confirmed) connections are mangled according to the rest of the nested attributes. The corresponding OVS userspace patch series includes test cases (in tests/system-traffic.at) that also serve as example uses. This work extends on a branch by Thomas Graf at https://github.com/tgraf/ovs/tree/nat. Signed-off-by: Jarno Rajahalme Acked-by: Thomas Graf Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/openvswitch.h | 49 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a27222d5b413..616d04761730 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -454,6 +454,14 @@ struct ovs_key_ct_labels { #define OVS_CS_F_REPLY_DIR 0x08 /* Flow is in the reply direction. */ #define OVS_CS_F_INVALID 0x10 /* Could not track connection. */ #define OVS_CS_F_TRACKED 0x20 /* Conntrack has occurred. */ +#define OVS_CS_F_SRC_NAT 0x40 /* Packet's source address/port was + * mangled by NAT. + */ +#define OVS_CS_F_DST_NAT 0x80 /* Packet's destination address/port + * was mangled by NAT. + */ + +#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. @@ -632,6 +640,8 @@ struct ovs_action_hash { * mask. For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. + * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address + * translation (NAT) on the packet. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -641,11 +651,50 @@ enum ovs_ct_attr { OVS_CT_ATTR_LABELS, /* labels to associate with this connection. */ OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of related connections. */ + OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ __OVS_CT_ATTR_MAX }; #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1) +/** + * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT. + * + * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port). + * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination + * address/port). Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be + * specified. Effective only for packets for ct_state NEW connections. + * Packets of committed connections are mangled by the NAT action according to + * the committed NAT type regardless of the flags specified. As a corollary, a + * NAT action without a NAT type flag will only mangle packets of committed + * connections. The following NAT attributes only apply for NEW + * (non-committed) connections, and they may be included only when the CT + * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or + * @OVS_NAT_ATTR_DST is also included. + * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr + * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port) + * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port) + * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots + * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5) + * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping + */ +enum ovs_nat_attr { + OVS_NAT_ATTR_UNSPEC, + OVS_NAT_ATTR_SRC, + OVS_NAT_ATTR_DST, + OVS_NAT_ATTR_IP_MIN, + OVS_NAT_ATTR_IP_MAX, + OVS_NAT_ATTR_PROTO_MIN, + OVS_NAT_ATTR_PROTO_MAX, + OVS_NAT_ATTR_PERSISTENT, + OVS_NAT_ATTR_PROTO_HASH, + OVS_NAT_ATTR_PROTO_RANDOM, + __OVS_NAT_ATTR_MAX, +}; + +#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1) + /** * enum ovs_action_attr - Action types. * -- cgit v1.2.3 From fb781c8e2a370d67acf7b8a8826e6f5e3ae1d7c6 Mon Sep 17 00:00:00 2001 From: Xing Zheng Date: Mon, 14 Mar 2016 16:01:56 +0800 Subject: clk: rockchip: add node-id for rk3036 emac hclk Add the node-id for the emac hclk to the binding header. Signed-off-by: Xing Zheng Signed-off-by: Caesar Wang Cc: Xing Zheng Cc: Michael Turquette Cc: Heiko Stuebner Cc: Stephen Boyd Cc: linux-clk@vger.kernel.org Cc: linux-rockchip@lists.infradead.org Signed-off-by: David S. Miller --- include/dt-bindings/clock/rk3036-cru.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/rk3036-cru.h b/include/dt-bindings/clock/rk3036-cru.h index ebc7a7b43f52..339659115695 100644 --- a/include/dt-bindings/clock/rk3036-cru.h +++ b/include/dt-bindings/clock/rk3036-cru.h @@ -92,6 +92,7 @@ #define HCLK_SDMMC 456 #define HCLK_SDIO 457 #define HCLK_EMMC 459 +#define HCLK_MAC 460 #define HCLK_I2S 462 #define HCLK_LCDC 465 #define HCLK_ROM 467 -- cgit v1.2.3 From f7e180222b973a0b363564b281a314276cb2b594 Mon Sep 17 00:00:00 2001 From: Xing Zheng Date: Mon, 14 Mar 2016 16:01:58 +0800 Subject: clk: rockchip: add clock-id for rk3036 emac pll source clock Suitable PLLs for the emac on the rk3036 are difficult to find and one of them is the (continuously changing) APLL. So in most cases it will be necessary to select a PLL manually. So add a clock-id for it. Signed-off-by: Xing Zheng Signed-off-by: Caesar Wang Cc: Xing Zheng Cc: Michael Turquette Cc: Heiko Stuebner Cc: Stephen Boyd Cc: linux-clk@vger.kernel.org Cc: linux-rockchip@lists.infradead.org Signed-off-by: David S. Miller --- include/dt-bindings/clock/rk3036-cru.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/rk3036-cru.h b/include/dt-bindings/clock/rk3036-cru.h index 339659115695..de44109a3a04 100644 --- a/include/dt-bindings/clock/rk3036-cru.h +++ b/include/dt-bindings/clock/rk3036-cru.h @@ -54,6 +54,7 @@ #define SCLK_PVTM_VIDEO 125 #define SCLK_MAC 151 #define SCLK_MACREF 152 +#define SCLK_MACPLL 153 #define SCLK_SFC 160 /* aclk gates */ -- cgit v1.2.3 From 808c1b697c3c4dd2a7132882424c390b0d0acfb9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Mar 2016 01:42:50 +0100 Subject: bpf, dst: add and use dst_tclassid helper We can just add a small helper dst_tclassid() for retrieving the dst->tclassid value. It makes the code a bit better in that we can get rid of the ifdef from filter.c by moving this into the header. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index c7329dcd90cc..5c98443c1c9e 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -398,6 +398,18 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, __skb_tunnel_rx(skb, dev, net); } +static inline u32 dst_tclassid(const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + const struct dst_entry *dst; + + dst = skb_dst(skb); + if (dst) + return dst->tclassid; +#endif + return 0; +} + int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { -- cgit v1.2.3 From fca5fdf67de9e092fda23c9eb059ba968e7b5267 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Mar 2016 01:42:51 +0100 Subject: ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it eBPF defines this as BPF_TUNLEN_MAX and OVS just uses the hard-coded value inside struct sw_flow_key. Thus, add and use IP_TUNNEL_OPTS_MAX for this, which makes the code a bit more generic and allows to remove BPF_TUNLEN_MAX from eBPF code. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5dc2e454f866..c35dda9ec991 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -7,6 +7,8 @@ #include #include #include +#include + #include #include #include @@ -57,6 +59,11 @@ struct ip_tunnel_key { #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ +/* Maximum tunnel options length. */ +#define IP_TUNNEL_OPTS_MAX \ + GENMASK((FIELD_SIZEOF(struct ip_tunnel_info, \ + options_len) * BITS_PER_BYTE) - 1, 0) + struct ip_tunnel_info { struct ip_tunnel_key key; #ifdef CONFIG_DST_CACHE -- cgit v1.2.3 From 93e68cd6115f67d8363c94dae8206af36f6d3b00 Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Wed, 16 Mar 2016 09:12:46 +0000 Subject: net: fix a comment typo Fix a comment typo. Signed-off-by: Zhang Shengju Signed-off-by: David S. Miller --- include/uapi/linux/if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 9cf2394f0bcf..f80277569f24 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -37,7 +37,7 @@ * are shared for all types of net_devices. The sysfs entries are available * via /sys/class/net//flags. Flags which can be toggled through sysfs * are annotated below, note that only a few flags can be toggled and some - * other flags are always always preserved from the original net_device flags + * other flags are always preserved from the original net_device flags * even if you try to set them via sysfs. Flags which are always preserved * are kept under the flag grouping @IFF_VOLATILE. Flags which are volatile * are annotated below as such. -- cgit v1.2.3 From fe30937b65354c7fec244caebbdaae68e28ca797 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Mar 2016 17:23:36 -0700 Subject: bonding: fix bond_get_stats() bond_get_stats() can be called from rtnetlink (with RTNL held) or from /proc/net/dev seq handler (with RCU held) The logic added in commit 5f0c5f73e5ef ("bonding: make global bonding stats more reliable") kind of assumed only one cpu could run there. If multiple threads are reading /proc/net/dev, stats can be really messed up after a while. A second problem is that some fields are 32bit, so we need to properly handle the wrap around problem. Given that RTNL is not always held, we need to use bond_for_each_slave_rcu(). Fixes: 5f0c5f73e5ef ("bonding: make global bonding stats more reliable") Signed-off-by: Eric Dumazet Cc: Andy Gospodarek Cc: Jay Vosburgh Cc: Veaceslav Falico Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/bonding.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index ee6c52053aa3..791800ddd6d9 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -215,6 +215,7 @@ struct bonding { * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; + spinlock_t stats_lock; u8 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS -- cgit v1.2.3