From 7a7d3e4c031b969c3b570787fc9fdb605abda03e Mon Sep 17 00:00:00 2001 From: Simon Dinkin Date: Thu, 31 Aug 2017 13:09:36 +0300 Subject: mac80211: fix incorrect assignment of reassoc value this fix minor issue in the log message. in ieee80211_rx_mgmt_assoc_resp function, when assigning the reassoc value from the mgmt frame control: ieee80211_is_reassoc_resp function need to be used, instead of ieee80211_is_reassoc_req function. Signed-off-by: Simon Dinkin Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b588e593b0ec..3b8e2709d8de 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3155,7 +3155,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (len < 24 + 6) return; - reassoc = ieee80211_is_reassoc_req(mgmt->frame_control); + reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); aid = le16_to_cpu(mgmt->u.assoc_resp.aid); -- cgit v1.2.3 From 53168215909281a09d3afc6fb51a9d4f81f74d39 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:30 +0200 Subject: mac80211: fix VLAN handling with TXQs With TXQs, the AP_VLAN interfaces are resolved to their owner AP interface when enqueuing the frame, which makes sense since the frame really goes out on that as far as the driver is concerned. However, this introduces a problem: frames to be encrypted with a VLAN-specific GTK will now be encrypted with the AP GTK, since the information about which virtual interface to use to select the key is taken from the TXQ. Fix this by preserving info->control.vif and using that in the dequeue function. This now requires doing the driver-mapping in the dequeue as well. Since there's no way to filter the frames that are sitting on a TXQ, drop all frames, which may affect other interfaces, when an AP_VLAN is removed. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 17 +++++++++++++++-- net/mac80211/tx.c | 36 +++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9228ac73c429..44399322f356 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -792,6 +792,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { + struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; struct fq *fq = &local->fq; unsigned long flags; @@ -937,6 +938,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: + txq_sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -1007,8 +1011,17 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(sdata->vif.txq); + if (txq_sdata->vif.txq) { + struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); + + /* + * FIXME FIXME + * + * We really shouldn't purge the *entire* txqi since that + * contains frames for the other AP_VLANs (and possibly + * the AP itself) as well, but there's no API in FQ now + * to be able to filter. + */ spin_lock_bh(&fq->lock); ieee80211_txq_purge(local, txqi); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8858f4f185e9..94826680cf2b 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1276,11 +1276,6 @@ static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); } -static void ieee80211_set_skb_vif(struct sk_buff *skb, struct txq_info *txqi) -{ - IEEE80211_SKB_CB(skb)->control.vif = txqi->txq.vif; -} - static u32 codel_skb_len_func(const struct sk_buff *skb) { return skb->len; @@ -3414,6 +3409,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_tx_info *info; struct ieee80211_tx_data tx; ieee80211_tx_result r; + struct ieee80211_vif *vif; spin_lock_bh(&fq->lock); @@ -3430,8 +3426,6 @@ begin: if (!skb) goto out; - ieee80211_set_skb_vif(skb, txqi); - hdr = (struct ieee80211_hdr *)skb->data; info = IEEE80211_SKB_CB(skb); @@ -3488,6 +3482,34 @@ begin: } } + switch (tx.sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + vif = &tx.sdata->vif; + break; + } + tx.sdata = rcu_dereference(local->monitor_sdata); + if (tx.sdata) { + vif = &tx.sdata->vif; + info->hw_queue = + vif->hw_queue[skb_get_queue_mapping(skb)]; + } else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } else { + vif = NULL; + } + break; + case NL80211_IFTYPE_AP_VLAN: + tx.sdata = container_of(tx.sdata->bss, + struct ieee80211_sub_if_data, u.ap); + /* fall through */ + default: + vif = &tx.sdata->vif; + break; + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; out: spin_unlock_bh(&fq->lock); -- cgit v1.2.3 From d81b0fd0e7b76b05873299ae2dd310127b13613b Mon Sep 17 00:00:00 2001 From: Sharon Dvir Date: Sat, 5 Aug 2017 11:44:30 +0300 Subject: mac80211: shorten debug prints using ht_dbg() to avoid warning Invoking ht_dbg() with too long of a string will print a warning. Shorten the messages while retaining the printed patameters. Signed-off-by: Sharon Dvir Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index cbd48762256c..420486b5a1d9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -436,7 +436,7 @@ static void sta_addba_resp_timer_expired(unsigned long data) test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { rcu_read_unlock(); ht_dbg(sta->sdata, - "timer expired on %pM tid %d but we are not (or no longer) expecting addBA response there\n", + "timer expired on %pM tid %d not expecting addBA response\n", sta->sta.addr, tid); return; } @@ -639,7 +639,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, time_before(jiffies, sta->ampdu_mlme.last_addba_req_time[tid] + HT_AGG_RETRIES_PERIOD)) { ht_dbg(sdata, - "BA request denied - waiting a grace period after %d failed requests on %pM tid %u\n", + "BA request denied - %d failed requests on %pM tid %u\n", sta->ampdu_mlme.addba_req_num[tid], sta->sta.addr, tid); ret = -EBUSY; goto err_unlock_sta; -- cgit v1.2.3 From b44eebea181a36378bea8f27e7f9b4175bfad683 Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:29 +0300 Subject: mac80211: add MESH IE in the correct order VHT MESH support was added, but the order of the IEs wasn't enforced. Fix that. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 259698de569f..6aef6793d052 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1436,7 +1436,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, - /* mesh ID can't happen here */ + WLAN_EID_MESH_ID, /* 60 GHz can't happen here right now */ }; noffset = ieee80211_ie_split(ie, ie_len, -- cgit v1.2.3 From 89e9bfc4ee859ad2a477f10aa2d5c37377242296 Mon Sep 17 00:00:00 2001 From: Chunho Lee Date: Fri, 7 Jul 2017 17:03:14 -0700 Subject: mac80211: Fix null pointer dereference with iTXQ support This change adds null pointer check before dereferencing pointer dev on netif_tx_start_all_queues() when an interface is added. With iTXQ support, netif_tx_start_all_queues() is always called while an interface is added. however, the netdev queues are not associated and dev is null when the interface is either NL80211_IFTYPE_P2P_DEVICE or NL80211_IFTYPE_NAN. Signed-off-by: Chunho Lee Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 44399322f356..f75029abf728 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -731,7 +731,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type == NL80211_IFTYPE_AP_VLAN || local->ops->wake_tx_queue) { /* XXX: for AP_VLAN, actually track AP queues */ - netif_tx_start_all_queues(dev); + if (dev) + netif_tx_start_all_queues(dev); } else if (dev) { unsigned long flags; int n_acs = IEEE80211_NUM_ACS; -- cgit v1.2.3 From 979e1f08042b83152dfe3d76df10db31eb7edf98 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Jun 2017 12:20:28 +0200 Subject: mac80211: agg-tx: call drv_wake_tx_queue in proper context Since drv_wake_tx_queue() is normally called in the TX path, which is already in an RCU critical section, we should call it the same way in the aggregation code path, so if the driver expects to be able to use RCU, it'll already be protected without having to enter a nested critical section. Additionally, disable soft-IRQs, since not doing so could cause issues in a driver that relies on them already being disabled like in the other path. Fixes: ba8c3d6f16a1 ("mac80211: add an intermediate software queue implementation") Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 420486b5a1d9..bef516ec47f9 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -226,7 +226,11 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable) clear_bit(IEEE80211_TXQ_AMPDU, &txqi->flags); clear_bit(IEEE80211_TXQ_STOP, &txqi->flags); + local_bh_disable(); + rcu_read_lock(); drv_wake_tx_queue(sta->sdata->local, txqi); + rcu_read_unlock(); + local_bh_enable(); } /* -- cgit v1.2.3 From 6e46d8ce894374fc135c96a8d1057c6af1fef237 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Aug 2017 15:33:57 +0300 Subject: mac80211: flush hw_roc_start work before cancelling the ROC When HW ROC is supported it is possible that after the HW notified that the ROC has started, the ROC was cancelled and another ROC was added while the hw_roc_start worker is waiting on the mutex (since cancelling the ROC and adding another one also holds the same mutex). As a result, the hw_roc_start worker will continue to run after the new ROC is added but before it is actually started by the HW. This may result in notifying userspace that the ROC has started before it actually does, or in case of management tx ROC, in an attempt to tx while not on the right channel. In addition, when the driver will notify mac80211 that the second ROC has started, mac80211 will warn that this ROC has already been notified. Fix this by flushing the hw_roc_start work before cancelling an ROC. Cc: stable@vger.kernel.org Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/offchannel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index f8e7a8bbc618..faf4f6055000 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -707,6 +707,8 @@ static int ieee80211_cancel_roc(struct ieee80211_local *local, if (!cookie) return -ENOENT; + flush_work(&local->hw_roc_start); + mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) -- cgit v1.2.3 From ba83bfb1e86501d21077b570e10e443f1605be64 Mon Sep 17 00:00:00 2001 From: Igor Mitsyanko Date: Wed, 30 Aug 2017 13:52:25 -0700 Subject: nl80211: look for HT/VHT capabilities in beacon's tail There are no HT/VHT capabilities in cfg80211_ap_settings::beacon_ies, these should be looked for in beacon's tail instead. Fixes: 66cd794e3c30 ("nl80211: add HT/VHT capabilities to AP parameters") Signed-off-by: Igor Mitsyanko Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ce85420ecb0..0df8023f480b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3791,8 +3791,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) { const struct cfg80211_beacon_data *bcn = ¶ms->beacon; - size_t ies_len = bcn->beacon_ies_len; - const u8 *ies = bcn->beacon_ies; + size_t ies_len = bcn->tail_len; + const u8 *ies = bcn->tail; const u8 *rates; const u8 *cap; -- cgit v1.2.3 From 4e0854a74f08e6a9d847f2c2cfae7b07c931d125 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 6 Sep 2017 13:45:40 +0300 Subject: cfg80211: honor NL80211_RRF_NO_HT40{MINUS,PLUS} Honor the NL80211_RRF_NO_HT40{MINUS,PLUS} flags in reg_process_ht_flags_channel. Not doing so leads can lead to a firmware assert in iwlwifi for example. Fixes: b0d7aa59592b ("cfg80211: allow wiphy specific regdomain management") Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/wireless/reg.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5fae296a6a58..6e94f6934a0e 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -4,6 +4,7 @@ * Copyright 2007 Johannes Berg * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2017 Intel Deutschland GmbH * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1483,7 +1484,9 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, { struct ieee80211_supported_band *sband = wiphy->bands[channel->band]; struct ieee80211_channel *channel_before = NULL, *channel_after = NULL; + const struct ieee80211_regdomain *regd; unsigned int i; + u32 flags; if (!is_ht40_allowed(channel)) { channel->flags |= IEEE80211_CHAN_NO_HT40; @@ -1503,17 +1506,30 @@ static void reg_process_ht_flags_channel(struct wiphy *wiphy, channel_after = c; } + flags = 0; + regd = get_wiphy_regdom(wiphy); + if (regd) { + const struct ieee80211_reg_rule *reg_rule = + freq_reg_info_regd(MHZ_TO_KHZ(channel->center_freq), + regd, MHZ_TO_KHZ(20)); + + if (!IS_ERR(reg_rule)) + flags = reg_rule->flags; + } + /* * Please note that this assumes target bandwidth is 20 MHz, * if that ever changes we also need to change the below logic * to include that as well. */ - if (!is_ht40_allowed(channel_before)) + if (!is_ht40_allowed(channel_before) || + flags & NL80211_RRF_NO_HT40MINUS) channel->flags |= IEEE80211_CHAN_NO_HT40MINUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40MINUS; - if (!is_ht40_allowed(channel_after)) + if (!is_ht40_allowed(channel_after) || + flags & NL80211_RRF_NO_HT40PLUS) channel->flags |= IEEE80211_CHAN_NO_HT40PLUS; else channel->flags &= ~IEEE80211_CHAN_NO_HT40PLUS; -- cgit v1.2.3 From 98e93e968e4947cd71c2eb69e323682daa453ee7 Mon Sep 17 00:00:00 2001 From: Ilan peer Date: Wed, 6 Sep 2017 17:32:40 +0300 Subject: mac80211: Complete ampdu work schedule during session tear down Commit 7a7c0a6438b8 ("mac80211: fix TX aggregation start/stop callback race") added a cancellation of the ampdu work after the loop that stopped the Tx and Rx BA sessions. However, in some cases, e.g., during HW reconfig, the low level driver might call mac80211 APIs to complete the stopping of the BA sessions, which would queue the ampdu work to handle the actual completion. This work needs to be performed as otherwise mac80211 data structures would not be properly synced. Fix this by checking if BA session STOP_CB bit is set after the BA session cancellation and properly clean the session. Signed-off-by: Ilan Peer [Johannes: the work isn't flushed because that could do other things we don't want, and the locking situation isn't clear] Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index c92df492e898..4cba7fca10d4 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -300,6 +300,24 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); + + /* + * In case the tear down is part of a reconfigure due to HW restart + * request, it is possible that the low level driver requested to stop + * the BA session, so handle it to properly clean tid_tx data. + */ + mutex_lock(&sta->ampdu_mlme.mtx); + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + struct tid_ampdu_tx *tid_tx = + rcu_dereference_protected_tid_tx(sta, i); + + if (!tid_tx) + continue; + + if (test_and_clear_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state)) + ieee80211_stop_tx_ba_cb(sta, i, tid_tx); + } + mutex_unlock(&sta->ampdu_mlme.mtx); } void ieee80211_ba_session_work(struct work_struct *work) -- cgit v1.2.3 From bde59c475e0883e4c4294bcd9b9c7e08ae18c828 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Sep 2017 15:01:42 +0200 Subject: mac80211: fix deadlock in driver-managed RX BA session start When an RX BA session is started by the driver, and it has to tell mac80211 about it, the corresponding bit in tid_rx_manage_offl gets set and the BA session work is scheduled. Upon testing this bit, it will call __ieee80211_start_rx_ba_session(), thus deadlocking as it already holds the ampdu_mlme.mtx, which that acquires again. Fix this by adding ___ieee80211_start_rx_ba_session(), a version of the function that requires the mutex already held. Cc: stable@vger.kernel.org Fixes: 699cb58c8a52 ("mac80211: manage RX BA session offload without SKB queue") Reported-by: Matteo Croce Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 32 +++++++++++++++++++++----------- net/mac80211/ht.c | 6 +++--- net/mac80211/ieee80211_i.h | 4 ++++ 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2b36eff5d97e..2849a1fc41c5 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -245,10 +245,10 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d ieee80211_tx_skb(sdata, skb); } -void __ieee80211_start_rx_ba_session(struct sta_info *sta, - u8 dialog_token, u16 timeout, - u16 start_seq_num, u16 ba_policy, u16 tid, - u16 buf_size, bool tx, bool auto_seq) +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -267,7 +267,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg(sta->sdata, "STA %pM requests BA session on unsupported tid %d\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } if (!sta->sta.ht_cap.ht_supported) { @@ -275,14 +275,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); /* send a response anyway, it's an error case if we get here */ - goto end_no_lock; + goto end; } if (test_sta_flag(sta, WLAN_STA_BLOCK_BA)) { ht_dbg(sta->sdata, "Suspend in progress - Denying ADDBA request (%pM tid %d)\n", sta->sta.addr, tid); - goto end_no_lock; + goto end; } /* sanity check for incoming parameters: @@ -296,7 +296,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, ht_dbg_ratelimited(sta->sdata, "AddBA Req with bad params from %pM on tid %u. policy %d, buffer size %d\n", sta->sta.addr, tid, ba_policy, buf_size); - goto end_no_lock; + goto end; } /* determine default buffer size */ if (buf_size == 0) @@ -311,7 +311,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size, sta->sta.addr); /* examine state machine */ - mutex_lock(&sta->ampdu_mlme.mtx); + lockdep_assert_held(&sta->ampdu_mlme.mtx); if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) { @@ -415,15 +415,25 @@ end: __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); sta->ampdu_mlme.tid_rx_token[tid] = dialog_token; } - mutex_unlock(&sta->ampdu_mlme.mtx); -end_no_lock: if (tx) ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid, dialog_token, status, 1, buf_size, timeout); } +void __ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq) +{ + mutex_lock(&sta->ampdu_mlme.mtx); + ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout, + start_seq_num, ba_policy, tid, + buf_size, tx, auto_seq); + mutex_unlock(&sta->ampdu_mlme.mtx); +} + void ieee80211_process_addba_request(struct ieee80211_local *local, struct sta_info *sta, struct ieee80211_mgmt *mgmt, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 4cba7fca10d4..d6d0b4201e40 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -351,9 +351,9 @@ void ieee80211_ba_session_work(struct work_struct *work) if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl)) - __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, - IEEE80211_MAX_AMPDU_BUF, - false, true); + ___ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, + IEEE80211_MAX_AMPDU_BUF, + false, true); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2197c62a0a6e..9675814f64db 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1760,6 +1760,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq); +void ___ieee80211_start_rx_ba_session(struct sta_info *sta, + u8 dialog_token, u16 timeout, + u16 start_seq_num, u16 ba_policy, u16 tid, + u16 buf_size, bool tx, bool auto_seq); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From 39ad1297a2084e0724da73d9eda2ceb9573a5d6c Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 4 Sep 2017 14:21:12 +0800 Subject: sched: Use __qdisc_drop instead of kfree_skb in sch_prio and sch_qfq The commit 520ac30f4551 ("net_sched: drop packets after root qdisc lock is released) made a big change of tc for performance. There are two points left in sch_prio and sch_qfq which are not changed with that commit. Now enhance them now with __qdisc_drop. Signed-off-by: Gao Feng Signed-off-by: David S. Miller --- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index f31b28f788c0..2dd6c68ae91e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -80,7 +80,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return ret; } #endif diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index cd661a7f81e6..6ddfd4991108 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1215,7 +1215,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); - kfree_skb(skb); + __qdisc_drop(skb, to_free); return err; } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); -- cgit v1.2.3 From be82485fbcbb1cf11b13e7356231c72fdf21241c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Sep 2017 11:47:12 +0800 Subject: netlink: fix an use-after-free issue for nlk groups ChunYu found a netlink use-after-free issue by syzkaller: [28448.842981] BUG: KASAN: use-after-free in __nla_put+0x37/0x40 at addr ffff8807185e2378 [28448.969918] Call Trace: [...] [28449.117207] __nla_put+0x37/0x40 [28449.132027] nla_put+0xf5/0x130 [28449.146261] sk_diag_fill.isra.4.constprop.5+0x5a0/0x750 [netlink_diag] [28449.176608] __netlink_diag_dump+0x25a/0x700 [netlink_diag] [28449.202215] netlink_diag_dump+0x176/0x240 [netlink_diag] [28449.226834] netlink_dump+0x488/0xbb0 [28449.298014] __netlink_dump_start+0x4e8/0x760 [28449.317924] netlink_diag_handler_dump+0x261/0x340 [netlink_diag] [28449.413414] sock_diag_rcv_msg+0x207/0x390 [28449.432409] netlink_rcv_skb+0x149/0x380 [28449.467647] sock_diag_rcv+0x2d/0x40 [28449.484362] netlink_unicast+0x562/0x7b0 [28449.564790] netlink_sendmsg+0xaa8/0xe60 [28449.661510] sock_sendmsg+0xcf/0x110 [28449.865631] __sys_sendmsg+0xf3/0x240 [28450.000964] SyS_sendmsg+0x32/0x50 [28450.016969] do_syscall_64+0x25c/0x6c0 [28450.154439] entry_SYSCALL64_slow_path+0x25/0x25 It was caused by no protection between nlk groups' free in netlink_release and nlk groups' accessing in sk_diag_dump_groups. The similar issue also exists in netlink_seq_show(). This patch is to defer nlk groups' free in deferred_put_nlk_sk. Reported-by: ChunYu Wang Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5acee49db90b..94a61e602ee7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -691,6 +691,9 @@ static void deferred_put_nlk_sk(struct rcu_head *head) struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; + kfree(nlk->groups); + nlk->groups = NULL; + if (!refcount_dec_and_test(&sk->sk_refcnt)) return; @@ -769,9 +772,6 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } - kfree(nlk->groups); - nlk->groups = NULL; - local_bh_disable(); sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); local_bh_enable(); -- cgit v1.2.3 From f773608026ee1e75e2256e3ad625c222ff6f9305 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Sep 2017 11:53:29 +0800 Subject: netlink: access nlk groups safely in netlink bind and getname Now there is no lock protecting nlk ngroups/groups' accessing in netlink bind and getname. It's safe from nlk groups' setting in netlink_release, but not from netlink_realloc_groups called by netlink_setsockopt. netlink_lock_table is needed in both netlink bind and getname when accessing nlk groups. Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 94a61e602ee7..327807731b44 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -955,7 +955,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; - int err; + int err = 0; long unsigned int groups = nladdr->nl_groups; bool bound; @@ -983,6 +983,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, return -EINVAL; } + netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; @@ -993,7 +994,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (!err) continue; netlink_undo_bind(group, groups, sk); - return err; + goto unlock; } } @@ -1006,12 +1007,13 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_autobind(sock); if (err) { netlink_undo_bind(nlk->ngroups, groups, sk); - return err; + goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) - return 0; + goto unlock; + netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + @@ -1022,6 +1024,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_table_ungrab(); return 0; + +unlock: + netlink_unlock_table(); + return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, @@ -1079,7 +1085,9 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_groups = netlink_group_mask(nlk->dst_group); } else { nladdr->nl_pid = nlk->portid; + netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; + netlink_unlock_table(); } return 0; } -- cgit v1.2.3 From 8e0deed92406d93ae0365cb8a6134db5721e7aca Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Wed, 6 Sep 2017 11:08:06 +0200 Subject: tipc: remove unnecessary call to dev_net() The net device is already stored in the 'net' variable, so no need to call dev_net() again. Signed-off-by: Kleber Sacilotto de Souza Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bearer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ac1d66d7e1fd..47ec121574ce 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -637,7 +637,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_UNREGISTER: case NETDEV_CHANGENAME: - bearer_disable(dev_net(dev), b); + bearer_disable(net, b); break; } return NOTIFY_OK; -- cgit v1.2.3 From 80532384af4ccb6d6cc965fa9232aaa2c456362c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Sep 2017 13:14:19 +0200 Subject: net: sched: fix memleak for chain zero There's a memleak happening for chain 0. The thing is, chain 0 needs to be always present, not created on demand. Therefore tcf_block_get upon creation of block calls the tcf_chain_create function directly. The chain is created with refcnt == 1, which is not correct in this case and causes the memleak. So move the refcnt increment into tcf_chain_get function even for the case when chain needs to be created. Reported-by: Jakub Kicinski Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Signed-off-by: Jiri Pirko Tested-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/cls_api.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ea6c65fd5fc5..c743f03cfebd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; - chain->refcnt = 1; + chain->refcnt = 0; return chain; } @@ -217,15 +217,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - chain->refcnt++; - return chain; - } + if (chain->index == chain_index) + goto incref; } - if (create) - return tcf_chain_create(block, chain_index); - else - return NULL; + chain = create ? tcf_chain_create(block, chain_index) : NULL; + +incref: + if (chain) + chain->refcnt++; + return chain; } EXPORT_SYMBOL(tcf_chain_get); -- cgit v1.2.3 From 5c25f30c93fdc5bf25e62101aeaae7a4f9b421b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 5 Sep 2017 17:26:33 +0800 Subject: ip6_gre: update mtu properly in ip6gre_err Now when probessing ICMPV6_PKT_TOOBIG, ip6gre_err only subtracts the offset of gre header from mtu info. The expected mtu of gre device should also subtract gre header. Otherwise, the next packets still can't be sent out. Jianlin found this issue when using the topo: client(ip6gre)<---->(nic1)route(nic2)<----->(ip6gre)server and reducing nic2's mtu, then both tcp and sctp's performance with big size data became 0. This patch is to fix it by also subtracting grehdr (tun->tun_hlen) from mtu info when updating gre device's mtu in ip6gre_err(). It also needs to subtract ETH_HLEN if gre dev'type is ARPHRD_ETHER. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 67ff2aaf5dcb..b7a72d409334 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = be32_to_cpu(info) - offset; + mtu = be32_to_cpu(info) - offset - t->tun_hlen; + if (t->dev->type == ARPHRD_ETHER) + mtu -= ETH_HLEN; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; -- cgit v1.2.3 From ca2c1418efe9f7fe37aa1f355efdf4eb293673ce Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 6 Sep 2017 14:44:36 +0200 Subject: udp: drop head states only when all skb references are gone After commit 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") we clear the skb head state as soon as the skb carrying them is first processed. Since the same skb can be processed several times when MSG_PEEK is used, we can end up lacking the required head states, and eventually oopsing. Fix this clearing the skb head state only when processing the last skb reference. Reported-by: Eric Dumazet Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/skbuff.c | 9 +++------ net/ipv4/udp.c | 5 ++++- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 68065d7d383f..16982de649b9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -710,14 +710,11 @@ EXPORT_SYMBOL(consume_skb); * consume_stateless_skb - free an skbuff, assuming it is stateless * @skb: buffer to free * - * Works like consume_skb(), but this variant assumes that all the head - * states have been already dropped. + * Alike consume_skb(), but this variant assumes that this is the last + * skb reference and all the head states have been already dropped */ -void consume_stateless_skb(struct sk_buff *skb) +void __consume_stateless_skb(struct sk_buff *skb) { - if (!skb_unref(skb)) - return; - trace_consume_skb(skb); skb_release_data(skb); kfree_skbmem(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1c9e78c83c..ef29df8648e4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1397,12 +1397,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + if (!skb_unref(skb)) + return; + /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); - consume_stateless_skb(skb); + __consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); -- cgit v1.2.3 From 126f760ca94dae77425695f9f9238b731de86e32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Wed, 6 Sep 2017 18:35:51 +0200 Subject: rds: Fix incorrect statistics counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In rds_send_xmit() there is logic to batch the sends. However, if another thread has acquired the lock and has incremented the send_gen, it is considered a race and we yield. The code incrementing the s_send_lock_queue_raced statistics counter did not count this event correctly. This commit counts the race condition correctly. Changes from v1: - Removed check for *someone_on_xmit()* - Fixed incorrect indentation Signed-off-by: HÃ¥kon Bugge Reviewed-by: Knut Omang Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 058a40743041..b52cdc8ae428 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -428,14 +428,18 @@ over_batch: * some work and we will skip our goto */ if (ret == 0) { + bool raced; + smp_mb(); + raced = send_gen != READ_ONCE(cp->cp_send_gen); + if ((test_bit(0, &conn->c_map_queued) || - !list_empty(&cp->cp_send_queue)) && - send_gen == READ_ONCE(cp->cp_send_gen)) { - rds_stats_inc(s_send_lock_queue_raced); + !list_empty(&cp->cp_send_queue)) && !raced) { if (batch_count < send_batch_count) goto restart; queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + } else if (raced) { + rds_stats_inc(s_send_lock_queue_raced); } } out: -- cgit v1.2.3 From 1cc4a018669f2fb18c10010f1a7ab3f6fb688cef Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 20 Aug 2017 13:38:07 +0800 Subject: netfilter: ipvs: fix the issue that sctp_conn_schedule drops non-INIT packet Commit 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") changed to check packet type early. It introduced a side effect: if it's not a INIT packet, ports will be set as NULL, and the packet will be dropped later. It caused that sctp couldn't create connection when ipvs module is loaded and any scheduler is registered on server. Li Shuang reproduced it by running the cmds on sctp server: # ipvsadm -A -t 1.1.1.1:80 -s rr # ipvsadm -D -t 1.1.1.1:80 then the server could't work any more. This patch is to return 1 when it's not an INIT packet. It means ipvs will accept it without creating a conn for it, just like what it does for tcp. Fixes: 5e26b1b3abce ("ipvs: support scheduling inverse and icmp SCTP packets") Reported-by: Li Shuang Signed-off-by: Xin Long Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index e1efa446b305..81f08198b125 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -24,9 +24,12 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, if (sh) { sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); - if (sch && (sch->type == SCTP_CID_INIT || - sysctl_sloppy_sctp(ipvs))) + if (sch) { + if (!(sysctl_sloppy_sctp(ipvs) || + sch->type == SCTP_CID_INIT)) + return 1; ports = &sh->source; + } } } else { ports = skb_header_pointer( -- cgit v1.2.3 From 68913a018f6082f8f90abb8ff9114435ef45dff7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 20 Aug 2017 13:38:08 +0800 Subject: netfilter: ipvs: do not create conn for ABORT packet in sctp_conn_schedule There's no reason for ipvs to create a conn for an ABORT packet even if sysctl_sloppy_sctp is set. This patch is to accept it without creating a conn, just as ipvs does for tcp's RST packet. Signed-off-by: Xin Long Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 81f08198b125..57c8ee66491e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -25,7 +25,8 @@ sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, sch = skb_header_pointer(skb, iph->len + sizeof(_sctph), sizeof(_schunkh), &_schunkh); if (sch) { - if (!(sysctl_sloppy_sctp(ipvs) || + if (sch->type == SCTP_CID_ABORT || + !(sysctl_sloppy_sctp(ipvs) || sch->type == SCTP_CID_INIT)) return 1; ports = &sh->source; -- cgit v1.2.3 From ba1cc08d9488c94cb8d94f545305688b72a2a300 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 8 Sep 2017 10:26:19 +0200 Subject: ipv6: fix memory leak with multiple tables during netns destruction fib6_net_exit only frees the main and local tables. If another table was created with fib6_alloc_table, we leak it when the netns is destroyed. Fix this in the same way ip_fib_net_exit cleans up tables, by walking through the whole hashtable of fib6_table's. We can get rid of the special cases for local and main, since they're also part of the hashtable. Reproducer: ip netns add x ip -net x -6 rule add from 6003:1::/64 table 100 ip netns del x Reported-by: Jianlin Shi Fixes: 58f09b78b730 ("[NETNS][IPV6] ip6_fib - make it per network namespace") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a3b5c163325f..8280172c806c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -191,6 +191,12 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) } EXPORT_SYMBOL_GPL(rt6_free_pcpu); +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); +} + static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -2022,15 +2028,22 @@ out_timer: static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); -- cgit v1.2.3 From 75c2631468e8af554057246b2413e738dd96af3d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 31 Aug 2017 13:45:24 +0200 Subject: netfilter: nf_nat: don't bug when mapping already exists It seems preferrable to limp along if we have a conflicting mapping, its certainly better than a BUG(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 40573aa6c133..dc3519cc7209 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -416,7 +416,9 @@ nf_nat_setup_info(struct nf_conn *ct, WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); - BUG_ON(nf_nat_initialized(ct, maniptype)); + + if (WARN_ON(nf_nat_initialized(ct, maniptype))) + return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior -- cgit v1.2.3 From a5d7a714569199f909cd60ff7074107bf15c7db4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Sep 2017 22:41:03 +0200 Subject: netfilter: xtables: add scheduling opportunity in get_counters There are reports about spurious softlockups during iptables-restore, a backtrace i saw points at get_counters -- it uses a sequence lock and also has unbounded restart loop. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 1 + net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e04457198f93..9e2770fd00be 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -629,6 +629,7 @@ static void get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 576cba2b57e9..39286e543ee6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -776,6 +776,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; /* macro does multi eval of i */ + cond_resched(); } } } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 54b1e75eded1..01bd3ee5ebc6 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -795,6 +795,7 @@ get_counters(const struct xt_table_info *t, ADD_COUNTER(counters[i], bcnt, pcnt); ++i; + cond_resched(); } } } -- cgit v1.2.3 From e1bf1687740ce1a3598a1c5e452b852ff2190682 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:51 +0200 Subject: netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1. It was not a good idea. The custom hash table was a much better fit for this purpose. A fast lookup is not essential, in fact for most cases there is no lookup at all because original tuple is not taken and can be used as-is. What needs to be fast is insertion and deletion. rhlist removal however requires a rhlist walk. We can have thousands of entries in such a list if source port/addresses are reused for multiple flows, if this happens removal requests are so expensive that deletions of a few thousand flows can take several seconds(!). The advantages that we got from rhashtable are: 1) table auto-sizing 2) multiple locks 1) would be nice to have, but it is not essential as we have at most one lookup per new flow, so even a million flows in the bysource table are not a problem compared to current deletion cost. 2) is easy to add to custom hash table. I tried to add hlist_node to rhlist to speed up rhltable_remove but this isn't doable without changing semantics. rhltable_remove_fast will check that the to-be-deleted object is part of the table and that requires a list walk that we want to avoid. Furthermore, using hlist_node increases size of struct rhlist_head, which in turn increases nf_conn size. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821 Reported-by: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 130 ++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index dc3519cc7209..f090419f5f97 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,19 +30,17 @@ #include #include +static DEFINE_SPINLOCK(nf_nat_lock); + static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] __read_mostly; static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; -struct nf_nat_conn_key { - const struct net *net; - const struct nf_conntrack_tuple *tuple; - const struct nf_conntrack_zone *zone; -}; - -static struct rhltable nf_nat_bysource_table; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,17 +116,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed) +/* We keep an extra hash for each conntrack, for fast searching. */ +static unsigned int +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { - const struct nf_conntrack_tuple *t; - const struct nf_conn *ct = data; + unsigned int hash; + + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* Original src, to ensure we map it consistently if poss. */ + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - seed ^= net_hash_mix(nf_ct_net(ct)); - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32), - t->dst.protonum ^ seed); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -184,28 +184,6 @@ same_src(const struct nf_conn *ct, t->src.u.all == tuple->src.u.all); } -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct nf_nat_conn_key *key = arg->key; - const struct nf_conn *ct = obj; - - if (!same_src(ct, key->tuple) || - !net_eq(nf_ct_net(ct), key->net) || - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) - return 1; - - return 0; -} - -static struct rhashtable_params nf_nat_bysource_params = { - .head_offset = offsetof(struct nf_conn, nat_bysource), - .obj_hashfn = nf_nat_bysource_hash, - .obj_cmpfn = nf_nat_bysource_cmp, - .nelem_hint = 256, - .min_size = 1024, -}; - /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, @@ -216,26 +194,22 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { + unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; - struct nf_nat_conn_key key = { - .net = net, - .tuple = tuple, - .zone = zone - }; - struct rhlist_head *hl, *h; - - hl = rhltable_lookup(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; - - if (in_range(l3proto, l4proto, result, range)) - return 1; + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { + if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { + /* Copy source part from reply tuple. */ + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(l3proto, l4proto, result, range)) + return 1; + } } - return 0; } @@ -408,6 +382,7 @@ nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ @@ -449,19 +424,14 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { - struct nf_nat_conn_key key = { - .net = nf_ct_net(ct), - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - .zone = nf_ct_zone(ct), - }; - int err; - - err = rhltable_insert_key(&nf_nat_bysource_table, - &key, - &ct->nat_bysource, - nf_nat_bysource_params); - if (err) - return NF_DROP; + unsigned int srchash; + + srchash = hash_by_src(net, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_lock); + hlist_add_head_rcu(&ct->nat_bysource, + &nf_nat_bysource[srchash]); + spin_unlock_bh(&nf_nat_lock); } /* It's done. */ @@ -570,8 +540,9 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -699,9 +670,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + if (ct->status & IPS_SRC_NAT_DONE) { + spin_lock_bh(&nf_nat_lock); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_lock); + } } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -825,13 +798,16 @@ static int __init nf_nat_init(void) { int ret; - ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); - if (ret) - return ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhltable_destroy(&nf_nat_bysource_table); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -865,8 +841,8 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - - rhltable_destroy(&nf_nat_bysource_table); + synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8073e960a03bf7b5d5ebfc5ff18ac475e1688f46 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:39:52 +0200 Subject: netfilter: nat: use keyed locks no need to serialize on a single lock, we can partition the table and add/delete in parallel to different slots. This restores one of the advantages that got lost with the rhlist revert. Cc: Ivan Babrou Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f090419f5f97..f393a7086025 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -30,7 +30,7 @@ #include #include -static DEFINE_SPINLOCK(nf_nat_lock); +static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] @@ -425,13 +425,15 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; + spinlock_t *lock; srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - spin_lock_bh(&nf_nat_lock); + lock = &nf_nat_locks[srchash % ARRAY_SIZE(nf_nat_locks)]; + spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); - spin_unlock_bh(&nf_nat_lock); + spin_unlock_bh(lock); } /* It's done. */ @@ -525,6 +527,16 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } +static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) +{ + unsigned int h; + + h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + spin_lock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); + hlist_del_rcu(&ct->nat_bysource); + spin_unlock_bh(&nf_nat_locks[h % ARRAY_SIZE(nf_nat_locks)]); +} + static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) @@ -540,9 +552,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); + __nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -670,11 +680,8 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. */ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - if (ct->status & IPS_SRC_NAT_DONE) { - spin_lock_bh(&nf_nat_lock); - hlist_del_rcu(&ct->nat_bysource); - spin_unlock_bh(&nf_nat_lock); - } + if (ct->status & IPS_SRC_NAT_DONE) + __nf_nat_cleanup_conntrack(ct); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -796,10 +803,12 @@ static struct nf_ct_helper_expectfn follow_master_nat = { static int __init nf_nat_init(void) { - int ret; + int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; + if (nf_nat_htable_size < ARRAY_SIZE(nf_nat_locks)) + nf_nat_htable_size = ARRAY_SIZE(nf_nat_locks); nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) @@ -812,6 +821,9 @@ static int __init nf_nat_init(void) return ret; } + for (i = 0; i < ARRAY_SIZE(nf_nat_locks); i++) + spin_lock_init(&nf_nat_locks[i]); + nf_ct_helper_expectfn_register(&follow_master_nat); BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); -- cgit v1.2.3 From 74585d4f84379528347630253c42518c5002d2f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Sep 2017 14:47:57 +0200 Subject: netfilter: core: remove erroneous warn_on kernel test robot reported: WARNING: CPU: 0 PID: 1244 at net/netfilter/core.c:218 __nf_hook_entries_try_shrink+0x49/0xcd [..] After allowing batching in nf_unregister_net_hooks its possible that an earlier call to __nf_hook_entries_try_shrink already compacted the list. If this happens we don't need to do anything. Fixes: d3ad2c17b4047 ("netfilter: core: batch nf_unregister_net_hooks synchronize_net calls") Reported-by: kernel test robot Signed-off-by: Florian Westphal Acked-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 04fe25abc5f6..52cd2901a097 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -215,7 +215,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries __rcu **pp) if (skip == hook_entries) goto out_assign; - if (WARN_ON(skip == 0)) + if (skip == 0) return NULL; hook_entries -= skip; -- cgit v1.2.3 From 05d0eae7c1cf6bf7b19c02b3a97ff457b3317323 Mon Sep 17 00:00:00 2001 From: Zhizhou Tian Date: Fri, 8 Sep 2017 11:00:16 +0800 Subject: netfilter: xt_hashlimit: alloc hashtable with right size struct xt_byteslimit_htable used hlist_head, but memory allocation is done through sizeof(struct list_head). Signed-off-by: Zhizhou Tian Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 10d48234f5f4..962ea4a63d9f 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -279,7 +279,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / - sizeof(struct list_head); + sizeof(struct hlist_head); if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) @@ -287,7 +287,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, } /* FIXME: don't use vmalloc() here or anywhere else -HW */ hinfo = vmalloc(sizeof(struct xt_hashlimit_htable) + - sizeof(struct list_head) * size); + sizeof(struct hlist_head) * size); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; -- cgit v1.2.3 From 90c4ae4e2c1da9f1eaf846136861af43d4c1ff34 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Fri, 8 Sep 2017 01:38:58 -0400 Subject: netfilter: xt_hashlimit: fix build error caused by 64bit division 64bit division causes build/link errors on 32bit architectures. It prints out error messages like: ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined! The value of avg passed through by userspace in BYTE mode cannot exceed U32_MAX. Which means 64bit division in user2rate_bytes is unnecessary. To fix this I have changed the type of param 'user' to u32. Since anything greater than U32_MAX is an invalid input we error out in hashlimit_mt_check_common() when this is the case. Changes in v2: Making return type as u32 would cause an overflow for small values of 'user' (for example 2, 3 etc). To avoid this I bumped up 'r' to u64 again as well as the return type. This is OK since the variable that stores the result is u64. We still avoid 64bit division here since 'user' is u32. Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode") Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 962ea4a63d9f..5da8746f7b88 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -35,6 +35,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); @@ -527,12 +528,12 @@ static u64 user2rate(u64 user) } } -static u64 user2rate_bytes(u64 user) +static u64 user2rate_bytes(u32 user) { u64 r; - r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; - r = (r - 1) << 4; + r = user ? U32_MAX / user : U32_MAX; + r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; return r; } @@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.prev_window = 0; dh->rateinfo.current_rate = 0; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { - dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); + dh->rateinfo.rate = + user2rate_bytes((u32)hinfo->cfg.avg); if (hinfo->cfg.burst) dh->rateinfo.burst = hinfo->cfg.burst * dh->rateinfo.rate; @@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, /* Check for overflow. */ if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { - if (cfg->avg == 0) { + if (cfg->avg == 0 || cfg->avg > U32_MAX) { pr_info("hashlimit invalid rate\n"); return -ERANGE; } -- cgit v1.2.3 From 7906b00f5cd1cd484fced7fcda892176e3202c8a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 8 Sep 2017 11:35:21 -0300 Subject: sctp: fix missing wake ups in some situations Commit fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") minimized the number of wake ups that are triggered in case the association receives a packet with multiple data chunks on it and/or when io_events are enabled and then commit 0970f5b36659 ("sctp: signal sk_data_ready earlier on data chunks reception") moved the wake up to as soon as possible. It thus relies on the state machine running later to clean the flag that the event was already generated. The issue is that there are 2 call paths that calls sctp_ulpq_tail_event() outside of the state machine, causing the flag to linger and possibly omitting a needed wake up in the sequence. One of the call paths is when enabling SCTP_SENDER_DRY_EVENTS via setsockopt(SCTP_EVENTS), as noticed by Harald Welte. The other is when partial reliability triggers removal of chunks from the send queue when the application calls sendmsg(). This commit fixes it by not setting the flag in case the socket is not owned by the user, as it won't be cleaned later. This works for user-initiated calls and also for rx path processing. Fixes: fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible") Reported-by: Harald Welte Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/ulpqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0225d62a869f..a71be33f3afe 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { - sp->data_ready_signalled = 1; + if (!sock_owned_by_user(sk)) + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; -- cgit v1.2.3 From 1f3b359f1004bd34b7b0bad70b93e3c7af92a37b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 12:44:47 -0700 Subject: tcp: fix a request socket leak While the cited commit fixed a possible deadlock, it added a leak of the request socket, since reqsk_put() must be called if the BPF filter decided the ACK packet must be dropped. Fixes: d624d276d1dd ("tcp: fix possible deadlock in TCP stack vs BPF filter") Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- net/ipv6/tcp_ipv6.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a63486afa7a7..d9416b5162bc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1669,9 +1669,9 @@ process: */ sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 38f76d8b231e..64d94afa427f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1460,9 +1460,9 @@ process: } sock_hold(sk); refcounted = true; - if (tcp_filter(sk, skb)) - goto discard_and_relse; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; -- cgit v1.2.3 From 32a805baf0fb70b6dbedefcd7249ac7f580f9e3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 15:48:47 -0700 Subject: ipv6: fix typo in fib6_net_exit() IPv6 FIB should use FIB6_TABLE_HASHSZ, not FIB_TABLE_HASHSZ. Fixes: ba1cc08d9488 ("ipv6: fix memory leak with multiple tables during netns destruction") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 8280172c806c..e5308d7cbd75 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2033,7 +2033,7 @@ static void fib6_net_exit(struct net *net) rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; -- cgit v1.2.3 From 0f693f1995cf002432b70f43ce73f79bf8d0b6c9 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:34 +0800 Subject: ip_tunnel: fix setting ttl and tos value in collect_md mode ttl and tos variables are declared and assigned, but are not used in iptunnel_xmit() function. Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel") Cc: Alexei Starovoitov Signed-off-by: Haishuang Yan Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 129d1a3616f8..e1856bfa753d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) ip_rt_put(rt); goto tx_dropped; } - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, + df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: dev->stats.tx_errors++; -- cgit v1.2.3 From 18e1173d5f3615c8c2680853ba87830ad8a0febc Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Thu, 7 Sep 2017 14:08:35 +0800 Subject: ip6_tunnel: fix setting hop_limit value for ipv6 tunnel Similar to vxlan/geneve tunnel, if hop_limit is zero, it should fall back to ip6_dst_hoplimt(). Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3a0ba2ae4b0f..10a693a19323 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,6 +1184,7 @@ route_lookup: init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } + hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. -- cgit v1.2.3 From 109980b894e9dae66c37c3d804a415aa68b19c7e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 8 Sep 2017 00:14:51 +0200 Subject: bpf: don't select potentially stale ri->map from buggy xdp progs We can potentially run into a couple of issues with the XDP bpf_redirect_map() helper. The ri->map in the per CPU storage can become stale in several ways, mostly due to misuse, where we can then trigger a use after free on the map: i) prog A is calling bpf_redirect_map(), returning XDP_REDIRECT and running on a driver not supporting XDP_REDIRECT yet. The ri->map on that CPU becomes stale when the XDP program is unloaded on the driver, and a prog B loaded on a different driver which supports XDP_REDIRECT return code. prog B would have to omit calling to bpf_redirect_map() and just return XDP_REDIRECT, which would then access the freed map in xdp_do_redirect() since not cleared for that CPU. ii) prog A is calling bpf_redirect_map(), returning a code other than XDP_REDIRECT. prog A is then detached, which triggers release of the map. prog B is attached which, similarly as in i), would just return XDP_REDIRECT without having called bpf_redirect_map() and thus be accessing the freed map in xdp_do_redirect() since not cleared for that CPU. iii) prog A is attached to generic XDP, calling the bpf_redirect_map() helper and returning XDP_REDIRECT. xdp_do_generic_redirect() is currently not handling ri->map (will be fixed by Jesper), so it's not being reset. Later loading a e.g. native prog B which would, say, call bpf_xdp_redirect() and then returns XDP_REDIRECT would find in xdp_do_redirect() that a map was set and uses that causing use after free on map access. Fix thus needs to avoid accessing stale ri->map pointers, naive way would be to call a BPF function from drivers that just resets it to NULL for all XDP return codes but XDP_REDIRECT and including XDP_REDIRECT for drivers not supporting it yet (and let ri->map being handled in xdp_do_generic_redirect()). There is a less intrusive way w/o letting drivers call a reset for each BPF run. The verifier knows we're calling into bpf_xdp_redirect_map() helper, so it can do a small insn rewrite transparent to the prog itself in the sense that it fills R4 with a pointer to the own bpf_prog. We have that pointer at verification time anyway and R4 is allowed to be used as per calling convention we scratch R0 to R5 anyway, so they become inaccessible and program cannot read them prior to a write. Then, the helper would store the prog pointer in the current CPUs struct redirect_info. Later in xdp_do_*_redirect() we check whether the redirect_info's prog pointer is the same as passed xdp_prog pointer, and if that's the case then all good, since the prog holds a ref on the map anyway, so it is always valid at that point in time and must have a reference count of at least 1. If in the unlikely case they are not equal, it means we got a stale pointer, so we clear and bail out right there. Also do reset map and the owning prog in bpf_xdp_redirect(), so that bpf_xdp_redirect_map() and bpf_xdp_redirect() won't get mixed up, only the last call should take precedence. A tc bpf_redirect() doesn't use map anywhere yet, so no need to clear it there since never accessed in that layer. Note that in case the prog is released, and thus the map as well we're still under RCU read critical section at that time and have preemption disabled as well. Once we commit with the __dev_map_insert_ctx() from xdp_do_redirect_map() and set the map to ri->map_to_flush, we still wait for a xdp_do_flush_map() to finish in devmap dismantle time once flush_needed bit is set, so that is fine. Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine") Reported-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5912c738a7b2..0848df2cd9bf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1794,6 +1794,7 @@ struct redirect_info { u32 flags; struct bpf_map *map; struct bpf_map *map_to_flush; + const struct bpf_prog *map_owner; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1807,7 +1808,6 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; - ri->map = NULL; return TC_ACT_REDIRECT; } @@ -2504,6 +2504,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + const struct bpf_prog *map_owner = ri->map_owner; struct bpf_map *map = ri->map; u32 index = ri->ifindex; struct net_device *fwd; @@ -2511,6 +2512,15 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, ri->ifindex = 0; ri->map = NULL; + ri->map_owner = NULL; + + /* This is really only caused by a deliberately crappy + * BPF program, normally we would never hit that case, + * so no need to inform someone via tracepoints either, + * just bail out. + */ + if (unlikely(map_owner != xdp_prog)) + return -EINVAL; fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2607,6 +2617,8 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; + ri->map_owner = NULL; return XDP_REDIRECT; } @@ -2619,7 +2631,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, + const struct bpf_prog *, map_owner) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -2629,10 +2642,14 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags ri->ifindex = ifindex; ri->flags = flags; ri->map = map; + ri->map_owner = map_owner; return XDP_REDIRECT; } +/* Note, arg4 is hidden from users and populated by the verifier + * with the right pointer. + */ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, -- cgit v1.2.3 From bbbe211c295ffb309247adb7b871dda60d92d2d5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 8 Sep 2017 14:00:30 -0700 Subject: net: rcu lock and preempt disable missing around generic xdp do_xdp_generic must be called inside rcu critical section with preempt disabled to ensure BPF programs are valid and per-cpu variables used for redirect operations are consistent. This patch ensures this is true and fixes the splat below. The netif_receive_skb_internal() code path is now broken into two rcu critical sections. I decided it was better to limit the preempt_enable/disable block to just the xdp static key portion and the fallout is more rcu_read_lock/unlock calls. Seems like the best option to me. [ 607.596901] ============================= [ 607.596906] WARNING: suspicious RCU usage [ 607.596912] 4.13.0-rc4+ #570 Not tainted [ 607.596917] ----------------------------- [ 607.596923] net/core/dev.c:3948 suspicious rcu_dereference_check() usage! [ 607.596927] [ 607.596927] other info that might help us debug this: [ 607.596927] [ 607.596933] [ 607.596933] rcu_scheduler_active = 2, debug_locks = 1 [ 607.596938] 2 locks held by pool/14624: [ 607.596943] #0: (rcu_read_lock_bh){......}, at: [] ip_finish_output2+0x14d/0x890 [ 607.596973] #1: (rcu_read_lock_bh){......}, at: [] __dev_queue_xmit+0x14a/0xfd0 [ 607.597000] [ 607.597000] stack backtrace: [ 607.597006] CPU: 5 PID: 14624 Comm: pool Not tainted 4.13.0-rc4+ #570 [ 607.597011] Hardware name: Dell Inc. Precision Tower 5810/0HHV7N, BIOS A17 03/01/2017 [ 607.597016] Call Trace: [ 607.597027] dump_stack+0x67/0x92 [ 607.597040] lockdep_rcu_suspicious+0xdd/0x110 [ 607.597054] do_xdp_generic+0x313/0xa50 [ 607.597068] ? time_hardirqs_on+0x5b/0x150 [ 607.597076] ? mark_held_locks+0x6b/0xc0 [ 607.597088] ? netdev_pick_tx+0x150/0x150 [ 607.597117] netif_rx_internal+0x205/0x3f0 [ 607.597127] ? do_xdp_generic+0xa50/0xa50 [ 607.597144] ? lock_downgrade+0x2b0/0x2b0 [ 607.597158] ? __lock_is_held+0x93/0x100 [ 607.597187] netif_rx+0x119/0x190 [ 607.597202] loopback_xmit+0xfd/0x1b0 [ 607.597214] dev_hard_start_xmit+0x127/0x4e0 Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Fixes: b5cdae3291f7 ("net: Generic XDP") Acked-by: Daniel Borkmann Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 6f845e4fec17..fb766d906148 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3981,8 +3981,13 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; + + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); /* Consider XDP consuming the packet a success from * the netdev point of view we do not want to count @@ -4500,18 +4505,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - rcu_read_lock(); - if (static_key_false(&generic_xdp_needed)) { - int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), - skb); + int ret; - if (ret != XDP_PASS) { - rcu_read_unlock(); + preempt_disable(); + rcu_read_lock(); + ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + rcu_read_unlock(); + preempt_enable(); + + if (ret != XDP_PASS) return NET_RX_DROP; - } } + rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; -- cgit v1.2.3 From 9beb8bedb05c5f3a353dba62b8fa7cbbb9ec685e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 Sep 2017 01:40:35 +0200 Subject: bpf: make error reporting in bpf_warn_invalid_xdp_action more clear Differ between illegal XDP action code and just driver unsupported one to provide better feedback when we throw a one-time warning here. Reason is that with 814abfabef3c ("xdp: add bpf_redirect helper function") not all drivers support the new XDP return code yet and thus they will fall into their 'default' case when checking for return codes after program return, which then triggers a bpf_warn_invalid_xdp_action() stating that the return code is illegal, but from XDP perspective it's not. I decided not to place something like a XDP_ACT_MAX define into uapi i) given we don't have this either for all other program types, ii) future action codes could have further encoding there, which would render such define unsuitable and we wouldn't be able to rip it out again, and iii) we rarely add new action codes. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0848df2cd9bf..3a50a9b021e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3609,7 +3609,11 @@ static bool xdp_is_valid_access(int off, int size, void bpf_warn_invalid_xdp_action(u32 act) { - WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); + const u32 act_max = XDP_REDIRECT; + + WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n", + act > act_max ? "Illegal" : "Driver unsupported", + act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -- cgit v1.2.3