Diffstat (limited to 'net'): 210 files changed, 11848 insertions, 1835 deletions
diff --git a/net/Kconfig b/net/Kconfig index 2eeb0e55f7c9..df8d8c9bd021 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,9 @@ config NET_INGRESS config NET_EGRESS bool +config NET_REDIRECT + bool + config SKB_EXTENSIONS bool diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h index 0029d5119be6..2fd253a61a2a 100644 --- a/net/bluetooth/a2mp.h +++ b/net/bluetooth/a2mp.h @@ -36,14 +36,14 @@ struct a2mp_cmd { __u8 code; __u8 ident; __le16 len; - __u8 data[0]; + __u8 data[]; } __packed; /* A2MP command codes */ #define A2MP_COMMAND_REJ 0x01 struct a2mp_cmd_rej { __le16 reason; - __u8 data[0]; + __u8 data[]; } __packed; #define A2MP_DISCOVER_REQ 0x02 @@ -62,7 +62,7 @@ struct a2mp_cl { struct a2mp_discov_rsp { __le16 mtu; __le16 ext_feat; - struct a2mp_cl cl[0]; + struct a2mp_cl cl[]; } __packed; #define A2MP_CHANGE_NOTIFY 0x04 @@ -93,7 +93,7 @@ struct a2mp_amp_assoc_req { struct a2mp_amp_assoc_rsp { __u8 id; __u8 status; - __u8 amp_assoc[0]; + __u8 amp_assoc[]; } __packed; #define A2MP_CREATEPHYSLINK_REQ 0x0A @@ -101,7 +101,7 @@ struct a2mp_amp_assoc_rsp { struct a2mp_physlink_req { __u8 local_id; __u8 remote_id; - __u8 amp_assoc[0]; + __u8 amp_assoc[]; } __packed; #define A2MP_CREATEPHYSLINK_RSP 0x0B diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h index 24f18b133959..9680473ed7ef 100644 --- a/net/bluetooth/bnep/bnep.h +++ b/net/bluetooth/bnep/bnep.h @@ -74,14 +74,14 @@ struct bnep_setup_conn_req { __u8 type; __u8 ctrl; __u8 uuid_size; - __u8 service[0]; + __u8 service[]; } __packed; struct bnep_set_filter_req { __u8 type; __u8 ctrl; __be16 len; - __u8 list[0]; + __u8 list[]; } __packed; struct bnep_control_rsp { @@ -93,7 +93,7 @@ struct bnep_control_rsp { struct bnep_ext_hdr { __u8 type; __u8 len; - __u8 data[0]; + __u8 data[]; } __packed; /* BNEP ioctl defines */ diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 87691404d0c6..e245bc155cc2 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -467,6 +467,23 @@ static void hci_conn_auto_accept(struct work_struct *work) &conn->dst); } +static void le_disable_advertising(struct hci_dev *hdev) +{ + if (ext_adv_capable(hdev)) { + struct hci_cp_le_set_ext_adv_enable cp; + + cp.enable = 0x00; + cp.num_of_sets = 0x00; + + hci_send_cmd(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), + &cp); + } else { + u8 enable = 0x00; + hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), + &enable); + } +} + static void le_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, @@ -481,9 +498,8 @@ static void le_conn_timeout(struct work_struct *work) * (which doesn't have a timeout of its own). */ if (conn->role == HCI_ROLE_SLAVE) { - u8 enable = 0x00; - hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), - &enable); + /* Disable LE Advertising */ + le_disable_advertising(hdev); hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); return; } @@ -898,6 +914,16 @@ static void hci_req_directed_advertising(struct hci_request *req, cp.peer_addr_type = conn->dst_type; bacpy(&cp.peer_addr, &conn->dst); + /* As per Core Spec 5.2 Vol 2, Part E, Sec 7.8.53, + * advertising_event_property LE_LEGACY_ADV_DIRECT_IND + * does not support advertising data when the advertising set already + * contains some, and the controller shall return the error code + * 'Invalid HCI Command Parameters' (0x12). + * So it is required to remove the adv set for handle 0x00, since we + * use instance 0 for directed adv.
+ */ + hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(cp.handle), &cp.handle); + hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); if (own_addr_type == ADDR_LE_DEV_RANDOM && @@ -1029,11 +1055,8 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * anyway have to disable it in order to start directed * advertising. */ - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { - u8 enable = 0x00; - hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), - &enable); - } + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + __hci_req_disable_advertising(&req); /* If requested to connect as slave use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { @@ -1725,3 +1748,110 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle) return hchan; } + +u32 hci_conn_get_phy(struct hci_conn *conn) +{ + u32 phys = 0; + + hci_dev_lock(conn->hdev); + + /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471: + * Table 6.2: Packets defined for synchronous, asynchronous, and + * CSB logical transport types. + */ + switch (conn->type) { + case SCO_LINK: + /* SCO logical transport (1 Mb/s): + * HV1, HV2, HV3 and DV. + */ + phys |= BT_PHY_BR_1M_1SLOT; + + break; + + case ACL_LINK: + /* ACL logical transport (1 Mb/s) ptt=0: + * DH1, DM3, DH3, DM5 and DH5. + */ + phys |= BT_PHY_BR_1M_1SLOT; + + if (conn->pkt_type & (HCI_DM3 | HCI_DH3)) + phys |= BT_PHY_BR_1M_3SLOT; + + if (conn->pkt_type & (HCI_DM5 | HCI_DH5)) + phys |= BT_PHY_BR_1M_5SLOT; + + /* ACL logical transport (2 Mb/s) ptt=1: + * 2-DH1, 2-DH3 and 2-DH5. + */ + if (!(conn->pkt_type & HCI_2DH1)) + phys |= BT_PHY_EDR_2M_1SLOT; + + if (!(conn->pkt_type & HCI_2DH3)) + phys |= BT_PHY_EDR_2M_3SLOT; + + if (!(conn->pkt_type & HCI_2DH5)) + phys |= BT_PHY_EDR_2M_5SLOT; + + /* ACL logical transport (3 Mb/s) ptt=1: + * 3-DH1, 3-DH3 and 3-DH5. 
+ */ + if (!(conn->pkt_type & HCI_3DH1)) + phys |= BT_PHY_EDR_3M_1SLOT; + + if (!(conn->pkt_type & HCI_3DH3)) + phys |= BT_PHY_EDR_3M_3SLOT; + + if (!(conn->pkt_type & HCI_3DH5)) + phys |= BT_PHY_EDR_3M_5SLOT; + + break; + + case ESCO_LINK: + /* eSCO logical transport (1 Mb/s): EV3, EV4 and EV5 */ + phys |= BT_PHY_BR_1M_1SLOT; + + if (!(conn->pkt_type & (ESCO_EV4 | ESCO_EV5))) + phys |= BT_PHY_BR_1M_3SLOT; + + /* eSCO logical transport (2 Mb/s): 2-EV3, 2-EV5 */ + if (!(conn->pkt_type & ESCO_2EV3)) + phys |= BT_PHY_EDR_2M_1SLOT; + + if (!(conn->pkt_type & ESCO_2EV5)) + phys |= BT_PHY_EDR_2M_3SLOT; + + /* eSCO logical transport (3 Mb/s): 3-EV3, 3-EV5 */ + if (!(conn->pkt_type & ESCO_3EV3)) + phys |= BT_PHY_EDR_3M_1SLOT; + + if (!(conn->pkt_type & ESCO_3EV5)) + phys |= BT_PHY_EDR_3M_3SLOT; + + break; + + case LE_LINK: + if (conn->le_tx_phy & HCI_LE_SET_PHY_1M) + phys |= BT_PHY_LE_1M_TX; + + if (conn->le_rx_phy & HCI_LE_SET_PHY_1M) + phys |= BT_PHY_LE_1M_RX; + + if (conn->le_tx_phy & HCI_LE_SET_PHY_2M) + phys |= BT_PHY_LE_2M_TX; + + if (conn->le_rx_phy & HCI_LE_SET_PHY_2M) + phys |= BT_PHY_LE_2M_RX; + + if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED) + phys |= BT_PHY_LE_CODED_TX; + + if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED) + phys |= BT_PHY_LE_CODED_RX; + + break; + } + + hci_dev_unlock(conn->hdev); + + return phys; +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index cbbc34a006d1..2e7bc2da8371 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -31,6 +31,8 @@ #include <linux/debugfs.h> #include <linux/crypto.h> #include <linux/property.h> +#include <linux/suspend.h> +#include <linux/wait.h> #include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> @@ -603,6 +605,9 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->commands[8] & 0x01) hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL); + if (hdev->commands[18] & 0x04) + hci_req_add(req, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL); + /* Some older Broadcom based Bluetooth 1.2 controllers do not * support the Read Page Scan Type command. Check support for * this command in the bit mask of supported commands. @@ -838,6 +843,26 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) sizeof(support), &support); } + /* Set erroneous data reporting if supported to the wideband speech + * setting value + */ + if (hdev->commands[18] & 0x08) { + bool enabled = hci_dev_test_flag(hdev, + HCI_WIDEBAND_SPEECH_ENABLED); + + if (enabled != + (hdev->err_data_reporting == ERR_DATA_REPORTING_ENABLED)) { + struct hci_cp_write_def_err_data_reporting cp; + + cp.err_data_reporting = enabled ? + ERR_DATA_REPORTING_ENABLED : + ERR_DATA_REPORTING_DISABLED; + + hci_req_add(req, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING, + sizeof(cp), &cp); + } + } + /* Set Suggested Default Data Length to maximum if supported */ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { struct hci_cp_le_write_def_data_len cp; @@ -1764,6 +1789,9 @@ int hci_dev_do_close(struct hci_dev *hdev) clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); + if (test_and_clear_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks)) + wake_up(&hdev->suspend_wait_q); + /* After this point our queues are empty * and no tasks are scheduled. 
*/ hdev->close(hdev); @@ -2285,7 +2313,7 @@ void hci_link_keys_clear(struct hci_dev *hdev) { struct link_key *key; - list_for_each_entry_rcu(key, &hdev->link_keys, list) { + list_for_each_entry(key, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } @@ -2295,7 +2323,7 @@ void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k; - list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { + list_for_each_entry(k, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -2305,7 +2333,7 @@ void hci_smp_irks_clear(struct hci_dev *hdev) { struct smp_irk *k; - list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) { + list_for_each_entry(k, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } @@ -2315,7 +2343,7 @@ void hci_blocked_keys_clear(struct hci_dev *hdev) { struct blocked_key *b; - list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { + list_for_each_entry(b, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } @@ -2327,7 +2355,7 @@ bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) struct blocked_key *b; rcu_read_lock(); - list_for_each_entry(b, &hdev->blocked_keys, list) { + list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { blocked = true; break; @@ -3241,6 +3269,94 @@ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, } } +static int hci_suspend_wait_event(struct hci_dev *hdev) +{ +#define WAKE_COND \ + (find_first_bit(hdev->suspend_tasks, __SUSPEND_NUM_TASKS) == \ + __SUSPEND_NUM_TASKS) + + int i; + int ret = wait_event_timeout(hdev->suspend_wait_q, + WAKE_COND, SUSPEND_NOTIFIER_TIMEOUT); + + if (ret == 0) { + bt_dev_dbg(hdev, "Timed out waiting for suspend"); + for (i = 0; i < __SUSPEND_NUM_TASKS; ++i) { + if (test_bit(i, hdev->suspend_tasks)) + bt_dev_dbg(hdev, "Bit %d is set", i); + clear_bit(i, hdev->suspend_tasks); + } + + ret = -ETIMEDOUT; + } else { + ret = 0; + } + + return ret; +} + +static void hci_prepare_suspend(struct work_struct *work) +{ + struct hci_dev *hdev = + container_of(work, struct hci_dev, suspend_prepare); + + hci_dev_lock(hdev); + hci_req_prepare_suspend(hdev, hdev->suspend_state_next); + hci_dev_unlock(hdev); +} + +static int hci_change_suspend_state(struct hci_dev *hdev, + enum suspended_state next) +{ + hdev->suspend_state_next = next; + set_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks); + queue_work(hdev->req_workqueue, &hdev->suspend_prepare); + return hci_suspend_wait_event(hdev); +} + +static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct hci_dev *hdev = + container_of(nb, struct hci_dev, suspend_notifier); + int ret = 0; + + /* If powering down, wait for completion. */ + if (mgmt_powering_down(hdev)) { + set_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks); + ret = hci_suspend_wait_event(hdev); + if (ret) + goto done; + } + + /* Suspend notifier should only act on events when powered. 
*/ + if (!hdev_is_powered(hdev)) + goto done; + + if (action == PM_SUSPEND_PREPARE) { + /* Suspend consists of two actions: + * - First, disconnect everything and make the controller not + * connectable (disabling scanning) + * - Second, program event filter/whitelist and enable scan + */ + ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); + + /* Only configure whitelist if disconnect succeeded */ + if (!ret) + ret = hci_change_suspend_state(hdev, + BT_SUSPEND_COMPLETE); + } else if (action == PM_POST_SUSPEND) { + ret = hci_change_suspend_state(hdev, BT_RUNNING); + } + + /* If suspend failed, restore it to running */ + if (ret && action == PM_SUSPEND_PREPARE) + hci_change_suspend_state(hdev, BT_RUNNING); + +done: + return ret ? notifier_from_errno(-EBUSY) : NOTIFY_STOP; +} + /* Alloc HCI device */ struct hci_dev *hci_alloc_dev(void) { @@ -3299,6 +3415,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->blacklist); INIT_LIST_HEAD(&hdev->whitelist); + INIT_LIST_HEAD(&hdev->wakeable); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); @@ -3318,6 +3435,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_WORK(&hdev->tx_work, hci_tx_work); INIT_WORK(&hdev->power_on, hci_power_on); INIT_WORK(&hdev->error_reset, hci_error_reset); + INIT_WORK(&hdev->suspend_prepare, hci_prepare_suspend); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); @@ -3326,6 +3444,7 @@ struct hci_dev *hci_alloc_dev(void) skb_queue_head_init(&hdev->raw_q); init_waitqueue_head(&hdev->req_wait_q); + init_waitqueue_head(&hdev->suspend_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); @@ -3437,6 +3556,11 @@ int hci_register_dev(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); + hdev->suspend_notifier.notifier_call = hci_suspend_notifier; + error = register_pm_notifier(&hdev->suspend_notifier); + if (error) + goto err_wqueue; + queue_work(hdev->req_workqueue, &hdev->power_on); return id; @@ -3470,6 +3594,8 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_dev_do_close(hdev); + unregister_pm_notifier(&hdev->suspend_notifier); + if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { @@ -4387,13 +4513,16 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr = (void *) skb->data; struct hci_conn *conn; - __u16 handle; + __u16 handle, flags; skb_pull(skb, HCI_SCO_HDR_SIZE); handle = __le16_to_cpu(hdr->handle); + flags = hci_flags(handle); + handle = hci_handle(handle); - BT_DBG("%s len %d handle 0x%4.4x", hdev->name, skb->len, handle); + BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len, + handle, flags); hdev->stat.sco_rx++; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6ddc4a74a5e4..0a591be8b0ae 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -901,6 +901,37 @@ static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, hdev->inq_tx_power = rp->tx_power; } +static void hci_cc_read_def_err_data_reporting(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_def_err_data_reporting *rp = (void *)skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + hdev->err_data_reporting = rp->err_data_reporting; +} + +static void hci_cc_write_def_err_data_reporting(struct hci_dev *hdev, + struct sk_buff *skb) +{ + __u8 status = *((__u8 *)skb->data); + 
struct hci_cp_write_def_err_data_reporting *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING); + if (!cp) + return; + + hdev->err_data_reporting = cp->err_data_reporting; +} + static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_pin_code_reply *rp = (void *) skb->data; @@ -2202,10 +2233,22 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); - if (conn) + if (conn) { + u8 type = conn->type; + mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); + /* If the disconnection failed for any reason, the upper layer + * does not retry to disconnect in current implementation. + * Hence, we need to do some basic cleanup here and re-enable + * advertising if necessary. + */ + hci_conn_del(conn); + if (type == LE_LINK) + hci_req_reenable_advertising(hdev); + } + hci_dev_unlock(hdev); } @@ -2474,6 +2517,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb) static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_ev_conn_complete *ev = (void *) skb->data; + struct inquiry_entry *ie; struct hci_conn *conn; BT_DBG("%s", hdev->name); @@ -2482,14 +2526,30 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); if (!conn) { - if (ev->link_type != SCO_LINK) - goto unlock; + /* Connection may not exist if auto-connected. Check the inquiry + * cache to see if we've already discovered this bdaddr before. + * If found and link is an ACL type, create a connection class + * automatically. + */ + ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr); + if (ie && ev->link_type == ACL_LINK) { + conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr, + HCI_ROLE_SLAVE); + if (!conn) { + bt_dev_err(hdev, "no memory for new conn"); + goto unlock; + } + } else { + if (ev->link_type != SCO_LINK) + goto unlock; - conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); - if (!conn) - goto unlock; + conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, + &ev->bdaddr); + if (!conn) + goto unlock; - conn->type = SCO_LINK; + conn->type = SCO_LINK; + } } if (!ev->status) { @@ -2743,6 +2803,14 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) hci_disconn_cfm(conn, ev->reason); hci_conn_del(conn); + /* The suspend notifier is waiting for all devices to disconnect so + * clear the bit from pending tasks and inform the wait queue. + */ + if (list_empty(&hdev->conn_hash.list) && + test_and_clear_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks)) { + wake_up(&hdev->suspend_wait_q); + } + /* Re-enable advertising if necessary, since it might * have been disabled by the connection. From the * HCI_LE_Set_Advertise_Enable command description in @@ -2895,14 +2963,14 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, if (!conn) goto unlock; - /* If we fail to read the encryption key size, assume maximum - * (which is the same we do also when this HCI command isn't - * supported. + /* While unexpected, the read_enc_key_size command may fail. The most + * secure approach is to then assume the key size is 0 to force a + * disconnection. 
*/ if (rp->status) { bt_dev_err(hdev, "failed to read key size for handle %u", handle); - conn->enc_key_size = HCI_LINK_KEY_SIZE; + conn->enc_key_size = 0; } else { conn->enc_key_size = rp->key_size; } @@ -3302,6 +3370,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_inq_rsp_tx_power(hdev, skb); break; + case HCI_OP_READ_DEF_ERR_DATA_REPORTING: + hci_cc_read_def_err_data_reporting(hdev, skb); + break; + + case HCI_OP_WRITE_DEF_ERR_DATA_REPORTING: + hci_cc_write_def_err_data_reporting(hdev, skb); + break; + case HCI_OP_PIN_CODE_REPLY: hci_cc_pin_code_reply(hdev, skb); break; @@ -4557,6 +4633,16 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev, goto confirm; } + /* If there already exists link key in local host, leave the + * decision to user space since the remote device could be + * legitimate or malicious. + */ + if (hci_find_link_key(hdev, &ev->bdaddr)) { + bt_dev_dbg(hdev, "Local host already has link key"); + confirm_hint = 1; + goto confirm; + } + BT_DBG("Auto-accept of user confirmation with %ums delay", hdev->auto_accept_delay); @@ -5858,6 +5944,11 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) u8 status = 0, event = hdr->evt, req_evt = 0; u16 opcode = HCI_OP_NOP; + if (!event) { + bt_dev_warn(hdev, "Received unexpected HCI Event 00000000"); + goto done; + } + if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->hci.req_event == event) { struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(cmd_hdr->opcode); @@ -6069,6 +6160,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) req_complete_skb(hdev, status, opcode, orig_skb); } +done: kfree_skb(orig_skb); kfree_skb(skb); hdev->stat.evt_rx++; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 2a1b64dbf76e..649e1e5ed446 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -34,6 +34,9 @@ #define HCI_REQ_PEND 1 #define HCI_REQ_CANCELED 2 +#define LE_SUSPEND_SCAN_WINDOW 0x0012 +#define LE_SUSPEND_SCAN_INTERVAL 0x0060 + void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); @@ -654,6 +657,11 @@ void hci_req_add_le_scan_disable(struct hci_request *req) { struct hci_dev *hdev = req->hdev; + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return; + } + if (use_ext_scan(hdev)) { struct hci_cp_le_set_ext_scan_enable cp; @@ -670,15 +678,55 @@ void hci_req_add_le_scan_disable(struct hci_request *req) } } -static void add_to_white_list(struct hci_request *req, - struct hci_conn_params *params) +static void del_from_white_list(struct hci_request *req, bdaddr_t *bdaddr, + u8 bdaddr_type) +{ + struct hci_cp_le_del_from_white_list cp; + + cp.bdaddr_type = bdaddr_type; + bacpy(&cp.bdaddr, bdaddr); + + bt_dev_dbg(req->hdev, "Remove %pMR (0x%x) from whitelist", &cp.bdaddr, + cp.bdaddr_type); + hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, sizeof(cp), &cp); +} + +/* Adds connection to white list if needed. On error, returns -1. 
*/ +static int add_to_white_list(struct hci_request *req, + struct hci_conn_params *params, u8 *num_entries, + bool allow_rpa) { struct hci_cp_le_add_to_white_list cp; + struct hci_dev *hdev = req->hdev; + + /* Already in white list */ + if (hci_bdaddr_list_lookup(&hdev->le_white_list, ¶ms->addr, + params->addr_type)) + return 0; + + /* Select filter policy to accept all advertising */ + if (*num_entries >= hdev->le_white_list_size) + return -1; + + /* White list can not be used with RPAs */ + if (!allow_rpa && + hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) { + return -1; + } + /* During suspend, only wakeable devices can be in whitelist */ + if (hdev->suspended && !params->wakeable) + return 0; + + *num_entries += 1; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, ¶ms->addr); + bt_dev_dbg(hdev, "Add %pMR (0x%x) to whitelist", &cp.bdaddr, + cp.bdaddr_type); hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp); + + return 0; } static u8 update_white_list(struct hci_request *req) @@ -686,7 +734,14 @@ static u8 update_white_list(struct hci_request *req) struct hci_dev *hdev = req->hdev; struct hci_conn_params *params; struct bdaddr_list *b; - uint8_t white_list_entries = 0; + u8 num_entries = 0; + bool pend_conn, pend_report; + /* We allow whitelisting even with RPAs in suspend. In the worst case, + * we won't be able to wake from devices that use the privacy1.2 + * features. Additionally, once we support privacy1.2 and IRK + * offloading, we can update this to also check for those conditions. + */ + bool allow_rpa = hdev->suspended; /* Go through the current white list programmed into the * controller one by one and check if that address is still @@ -695,29 +750,28 @@ static u8 update_white_list(struct hci_request *req) * command to remove it from the controller. */ list_for_each_entry(b, &hdev->le_white_list, list) { - /* If the device is neither in pend_le_conns nor - * pend_le_reports then remove it from the whitelist. + pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, + &b->bdaddr, + b->bdaddr_type); + pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports, + &b->bdaddr, + b->bdaddr_type); + + /* If the device is not likely to connect or report, + * remove it from the whitelist. */ - if (!hci_pend_le_action_lookup(&hdev->pend_le_conns, - &b->bdaddr, b->bdaddr_type) && - !hci_pend_le_action_lookup(&hdev->pend_le_reports, - &b->bdaddr, b->bdaddr_type)) { - struct hci_cp_le_del_from_white_list cp; - - cp.bdaddr_type = b->bdaddr_type; - bacpy(&cp.bdaddr, &b->bdaddr); - - hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST, - sizeof(cp), &cp); + if (!pend_conn && !pend_report) { + del_from_white_list(req, &b->bdaddr, b->bdaddr_type); continue; } - if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { - /* White list can not be used with RPAs */ + /* White list can not be used with RPAs */ + if (!allow_rpa && + hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) { return 0x00; } - white_list_entries++; + num_entries++; } /* Since all no longer valid white list entries have been @@ -731,47 +785,17 @@ static u8 update_white_list(struct hci_request *req) * white list. 
*/ list_for_each_entry(params, &hdev->pend_le_conns, action) { - if (hci_bdaddr_list_lookup(&hdev->le_white_list, - ¶ms->addr, params->addr_type)) - continue; - - if (white_list_entries >= hdev->le_white_list_size) { - /* Select filter policy to accept all advertising */ + if (add_to_white_list(req, params, &num_entries, allow_rpa)) return 0x00; - } - - if (hci_find_irk_by_addr(hdev, ¶ms->addr, - params->addr_type)) { - /* White list can not be used with RPAs */ - return 0x00; - } - - white_list_entries++; - add_to_white_list(req, params); } /* After adding all new pending connections, walk through * the list of pending reports and also add these to the - * white list if there is still space. + * white list if there is still space. Abort if space runs out. */ list_for_each_entry(params, &hdev->pend_le_reports, action) { - if (hci_bdaddr_list_lookup(&hdev->le_white_list, - ¶ms->addr, params->addr_type)) - continue; - - if (white_list_entries >= hdev->le_white_list_size) { - /* Select filter policy to accept all advertising */ - return 0x00; - } - - if (hci_find_irk_by_addr(hdev, ¶ms->addr, - params->addr_type)) { - /* White list can not be used with RPAs */ + if (add_to_white_list(req, params, &num_entries, allow_rpa)) return 0x00; - } - - white_list_entries++; - add_to_white_list(req, params); } /* Select filter policy to use white list */ @@ -866,6 +890,12 @@ void hci_req_add_le_passive_scan(struct hci_request *req) struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; + u8 window, interval; + + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return; + } /* Set require_privacy to false since no SCAN_REQ are send * during passive scanning. Not using an non-resolvable address @@ -896,8 +926,17 @@ void hci_req_add_le_passive_scan(struct hci_request *req) (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) filter_policy |= 0x02; - hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval, - hdev->le_scan_window, own_addr_type, filter_policy); + if (hdev->suspended) { + window = LE_SUSPEND_SCAN_WINDOW; + interval = LE_SUSPEND_SCAN_INTERVAL; + } else { + window = hdev->le_scan_window; + interval = hdev->le_scan_interval; + } + + bt_dev_dbg(hdev, "LE passive scan with whitelist = %d", filter_policy); + hci_req_start_scan(req, LE_SCAN_PASSIVE, interval, window, + own_addr_type, filter_policy); } static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) @@ -918,6 +957,187 @@ static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance) return adv_instance->scan_rsp_len; } +static void hci_req_clear_event_filter(struct hci_request *req) +{ + struct hci_cp_set_event_filter f; + + memset(&f, 0, sizeof(f)); + f.flt_type = HCI_FLT_CLEAR_ALL; + hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &f); + + /* Update page scan state (since we may have modified it when setting + * the event filter). 
+ */ + __hci_req_update_scan(req); +} + +static void hci_req_set_event_filter(struct hci_request *req) +{ + struct bdaddr_list *b; + struct hci_cp_set_event_filter f; + struct hci_dev *hdev = req->hdev; + u8 scan; + + /* Always clear event filter when starting */ + hci_req_clear_event_filter(req); + + list_for_each_entry(b, &hdev->wakeable, list) { + memset(&f, 0, sizeof(f)); + bacpy(&f.addr_conn_flt.bdaddr, &b->bdaddr); + f.flt_type = HCI_FLT_CONN_SETUP; + f.cond_type = HCI_CONN_SETUP_ALLOW_BDADDR; + f.addr_conn_flt.auto_accept = HCI_CONN_SETUP_AUTO_ON; + + bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr); + hci_req_add(req, HCI_OP_SET_EVENT_FLT, sizeof(f), &f); + } + + scan = !list_empty(&hdev->wakeable) ? SCAN_PAGE : SCAN_DISABLED; + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); +} + +static void hci_req_config_le_suspend_scan(struct hci_request *req) +{ + /* Can't change params without disabling first */ + hci_req_add_le_scan_disable(req); + + /* Configure params and enable scanning */ + hci_req_add_le_passive_scan(req); + + /* Block suspend notifier on response */ + set_bit(SUSPEND_SCAN_ENABLE, req->hdev->suspend_tasks); +} + +static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode) +{ + bt_dev_dbg(hdev, "Request complete opcode=0x%x, status=0x%x", opcode, + status); + if (test_and_clear_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks) || + test_and_clear_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks)) { + wake_up(&hdev->suspend_wait_q); + } +} + +/* Call with hci_dev_lock */ +void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) +{ + int old_state; + struct hci_conn *conn; + struct hci_request req; + u8 page_scan; + int disconnect_counter; + + if (next == hdev->suspend_state) { + bt_dev_dbg(hdev, "Same state before and after: %d", next); + goto done; + } + + hdev->suspend_state = next; + hci_req_init(&req, hdev); + + if (next == BT_SUSPEND_DISCONNECT) { + /* Mark device as suspended */ + hdev->suspended = true; + + /* Pause discovery if not already stopped */ + old_state = hdev->discovery.state; + if (old_state != DISCOVERY_STOPPED) { + set_bit(SUSPEND_PAUSE_DISCOVERY, hdev->suspend_tasks); + hci_discovery_set_state(hdev, DISCOVERY_STOPPING); + queue_work(hdev->req_workqueue, &hdev->discov_update); + } + + hdev->discovery_paused = true; + hdev->discovery_old_state = old_state; + + /* Stop advertising */ + old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING); + if (old_state) { + set_bit(SUSPEND_PAUSE_ADVERTISING, hdev->suspend_tasks); + cancel_delayed_work(&hdev->discov_off); + queue_delayed_work(hdev->req_workqueue, + &hdev->discov_off, 0); + } + + hdev->advertising_paused = true; + hdev->advertising_old_state = old_state; + /* Disable page scan */ + page_scan = SCAN_DISABLED; + hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan); + + /* Disable LE passive scan */ + hci_req_add_le_scan_disable(&req); + + /* Mark task needing completion */ + set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); + + /* Prevent disconnects from causing scanning to be re-enabled */ + hdev->scanning_paused = true; + + /* Run commands before disconnecting */ + hci_req_run(&req, suspend_req_complete); + + disconnect_counter = 0; + /* Soft disconnect everything (power off) */ + list_for_each_entry(conn, &hdev->conn_hash.list, list) { + hci_disconnect(conn, HCI_ERROR_REMOTE_POWER_OFF); + disconnect_counter++; + } + + if (disconnect_counter > 0) { + bt_dev_dbg(hdev, + "Had %d disconnects. 
Will wait on them", + disconnect_counter); + set_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks); + } + } else if (next == BT_SUSPEND_COMPLETE) { + /* Unpause to take care of updating scanning params */ + hdev->scanning_paused = false; + /* Enable event filter for paired devices */ + hci_req_set_event_filter(&req); + /* Enable passive scan at lower duty cycle */ + hci_req_config_le_suspend_scan(&req); + /* Pause scan changes again. */ + hdev->scanning_paused = true; + hci_req_run(&req, suspend_req_complete); + } else { + hdev->suspended = false; + hdev->scanning_paused = false; + + hci_req_clear_event_filter(&req); + /* Reset passive/background scanning to normal */ + hci_req_config_le_suspend_scan(&req); + + /* Unpause advertising */ + hdev->advertising_paused = false; + if (hdev->advertising_old_state) { + set_bit(SUSPEND_UNPAUSE_ADVERTISING, + hdev->suspend_tasks); + hci_dev_set_flag(hdev, HCI_ADVERTISING); + queue_work(hdev->req_workqueue, + &hdev->discoverable_update); + hdev->advertising_old_state = 0; + } + + /* Unpause discovery */ + hdev->discovery_paused = false; + if (hdev->discovery_old_state != DISCOVERY_STOPPED && + hdev->discovery_old_state != DISCOVERY_STOPPING) { + set_bit(SUSPEND_UNPAUSE_DISCOVERY, hdev->suspend_tasks); + hci_discovery_set_state(hdev, DISCOVERY_STARTING); + queue_work(hdev->req_workqueue, &hdev->discov_update); + } + + hci_req_run(&req, suspend_req_complete); + } + + hdev->suspend_state = next; + +done: + clear_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks); + wake_up(&hdev->suspend_wait_q); +} + static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev) { u8 instance = hdev->cur_adv_instance; @@ -1499,7 +1719,7 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { - BT_ERR("%s failed to generate new RPA", hdev->name); + bt_dev_err(hdev, "failed to generate new RPA"); return err; } @@ -2015,6 +2235,9 @@ void __hci_req_update_scan(struct hci_request *req) if (mgmt_powering_down(hdev)) return; + if (hdev->scanning_paused) + return; + if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || disconnected_whitelist_entries(hdev)) scan = SCAN_PAGE; @@ -2504,23 +2727,6 @@ static int active_scan(struct hci_request *req, unsigned long opt) BT_DBG("%s", hdev->name); - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { - hci_dev_lock(hdev); - - /* Don't let discovery abort an outgoing connection attempt - * that's using directed advertising. - */ - if (hci_lookup_le_connect(hdev)) { - hci_dev_unlock(hdev); - return -EBUSY; - } - - cancel_adv_timeout(hdev); - hci_dev_unlock(hdev); - - __hci_req_disable_advertising(req); - } - /* If controller is scanning, it means the background scanning is * running. Thus, we should temporarily stop it in order to set the * discovery scanning parameters. 
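The suspend support added to hci_request.c above works hand in hand with the notifier in hci_core.c: each step sets a SUSPEND_* bit in hdev->suspend_tasks, queues work on the request workqueue, and hci_suspend_wait_event() sleeps on hdev->suspend_wait_q until the completion paths (suspend_req_complete(), the disconnect-complete handler, hci_dev_do_close()) have cleared every bit. Below is a minimal user-space sketch of that task-bitmap/wait-queue pattern; it substitutes pthreads for the kernel waitqueue, and all names (task_bits, wait_all_clear, and so on) are illustrative rather than taken from the patch.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Bitmap of outstanding suspend tasks, mirroring hdev->suspend_tasks. */
static unsigned int task_bits;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Worker side: clear one task bit and wake the waiter
 * (the analogue of wake_up(&hdev->suspend_wait_q)). */
static void complete_task(unsigned int bit)
{
	pthread_mutex_lock(&lock);
	task_bits &= ~(1u << bit);
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

/* Notifier side: wait until all bits clear or the timeout expires
 * (the analogue of hci_suspend_wait_event()). Returns 0 or -1. */
static int wait_all_clear(int timeout_sec)
{
	struct timespec ts;
	int err = 0;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += timeout_sec;

	pthread_mutex_lock(&lock);
	while (task_bits && !err)
		err = pthread_cond_timedwait(&cond, &lock, &ts);
	if (err)	/* timed out: drop the stale bits, as the patch does */
		task_bits = 0;
	pthread_mutex_unlock(&lock);
	return err ? -1 : 0;
}

static void *worker(void *arg)
{
	(void)arg;
	complete_task(0);	/* e.g. "scan disabled" */
	complete_task(1);	/* e.g. "all links disconnected" */
	return NULL;
}

int main(void)
{
	pthread_t t;

	task_bits = (1u << 0) | (1u << 1);	/* two pending suspend tasks */
	pthread_create(&t, NULL, worker, NULL);
	printf("wait: %d\n", wait_all_clear(2));
	pthread_join(t, NULL);
	return 0;
}

Build with cc suspend_sketch.c -lpthread. If a worker never clears its bit, wait_all_clear() times out and discards the stale bits, mirroring how hci_suspend_wait_event() clears hdev->suspend_tasks after SUSPEND_NOTIFIER_TIMEOUT rather than blocking suspend forever.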
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index a7019fbeadd3..0e81614d235e 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -68,6 +68,8 @@ void __hci_req_update_eir(struct hci_request *req); void hci_req_add_le_scan_disable(struct hci_request *req); void hci_req_add_le_passive_scan(struct hci_request *req); +void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); + void hci_req_reenable_advertising(struct hci_dev *hdev); void __hci_req_enable_advertising(struct hci_request *req); void __hci_req_disable_advertising(struct hci_request *req); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index bef84b95e2c4..3b4fa27a44e6 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1279,7 +1279,7 @@ static int hidp_session_thread(void *arg) add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); /* This memory barrier is paired with wq_has_sleeper(). See * sock_poll_wait() for more information why this is needed. */ - smp_mb(); + smp_mb__before_atomic(); /* notify synchronous startup that we're ready */ atomic_inc(&session->state); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 195459a1e53e..fd9d0d08f9c9 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -45,6 +45,7 @@ #define LE_FLOWCTL_MAX_CREDITS 65535 bool disable_ertm; +bool enable_ecred; static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD; @@ -419,6 +420,9 @@ static void l2cap_chan_timeout(struct work_struct *work) BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); mutex_lock(&conn->chan_lock); + /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling + * this work. No need to call l2cap_chan_hold(chan) here again. 
+ */ l2cap_chan_lock(chan); if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG) @@ -431,12 +435,12 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_close(chan, reason); - l2cap_chan_unlock(chan); - chan->ops->close(chan); - mutex_unlock(&conn->chan_lock); + l2cap_chan_unlock(chan); l2cap_chan_put(chan); + + mutex_unlock(&conn->chan_lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -532,6 +536,17 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits) skb_queue_head_init(&chan->tx_q); } +static void l2cap_ecred_init(struct l2cap_chan *chan, u16 tx_credits) +{ + l2cap_le_flowctl_init(chan, tx_credits); + + /* L2CAP implementations shall support a minimum MPS of 64 octets */ + if (chan->mps < L2CAP_ECRED_MIN_MPS) { + chan->mps = L2CAP_ECRED_MIN_MPS; + chan->rx_credits = (chan->imtu / chan->mps) + 1; + } +} + void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, @@ -638,6 +653,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) break; case L2CAP_MODE_LE_FLOWCTL: + case L2CAP_MODE_EXT_FLOWCTL: skb_queue_purge(&chan->tx_q); break; @@ -662,6 +678,29 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) } EXPORT_SYMBOL_GPL(l2cap_chan_del); +static void __l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, + void *data) +{ + struct l2cap_chan *chan; + + list_for_each_entry(chan, &conn->chan_l, list) { + func(chan, data); + } +} + +void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, + void *data) +{ + if (!conn) + return; + + mutex_lock(&conn->chan_lock); + __l2cap_chan_list(conn, func, data); + mutex_unlock(&conn->chan_lock); +} + +EXPORT_SYMBOL_GPL(l2cap_chan_list); + static void l2cap_conn_update_id_addr(struct work_struct *work) { struct l2cap_conn *conn = container_of(work, struct l2cap_conn, @@ -704,6 +743,27 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) &rsp); } +static void l2cap_chan_ecred_connect_reject(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_ecred_conn_rsp rsp; + u16 result; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) + result = L2CAP_CR_LE_AUTHORIZATION; + else + result = L2CAP_CR_LE_BAD_PSM; + + l2cap_state_change(chan, BT_DISCONN); + + memset(&rsp, 0, sizeof(rsp)); + + rsp.result = cpu_to_le16(result); + + l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp), + &rsp); +} + static void l2cap_chan_connect_reject(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; @@ -749,8 +809,16 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason) if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) { if (conn->hcon->type == ACL_LINK) l2cap_chan_connect_reject(chan); - else if (conn->hcon->type == LE_LINK) - l2cap_chan_le_connect_reject(chan); + else if (conn->hcon->type == LE_LINK) { + switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + l2cap_chan_le_connect_reject(chan); + break; + case L2CAP_MODE_EXT_FLOWCTL: + l2cap_chan_ecred_connect_reject(chan); + break; + } + } } l2cap_chan_del(chan, reason); @@ -1273,8 +1341,13 @@ static void l2cap_chan_ready(struct l2cap_chan *chan) chan->conf_state = 0; __clear_chan_timer(chan); - if (chan->mode == L2CAP_MODE_LE_FLOWCTL && !chan->tx_credits) - chan->ops->suspend(chan); + switch (chan->mode) { + case L2CAP_MODE_LE_FLOWCTL: + case L2CAP_MODE_EXT_FLOWCTL: + if (!chan->tx_credits) + chan->ops->suspend(chan); + break; + } chan->state = BT_CONNECTED; @@ -1306,6 +1379,81 @@ static void 
l2cap_le_connect(struct l2cap_chan *chan) sizeof(req), &req); } +struct l2cap_ecred_conn_data { + struct { + struct l2cap_ecred_conn_req req; + __le16 scid[5]; + } __packed pdu; + struct l2cap_chan *chan; + struct pid *pid; + int count; +}; + +static void l2cap_ecred_defer_connect(struct l2cap_chan *chan, void *data) +{ + struct l2cap_ecred_conn_data *conn = data; + struct pid *pid; + + if (chan == conn->chan) + return; + + if (!test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags)) + return; + + pid = chan->ops->get_peer_pid(chan); + + /* Only add deferred channels with the same PID/PSM */ + if (conn->pid != pid || chan->psm != conn->chan->psm || chan->ident || + chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT) + return; + + if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) + return; + + l2cap_ecred_init(chan, 0); + + /* Set the same ident so we can match on the rsp */ + chan->ident = conn->chan->ident; + + /* Include all channels deferred */ + conn->pdu.scid[conn->count] = cpu_to_le16(chan->scid); + + conn->count++; +} + +static void l2cap_ecred_connect(struct l2cap_chan *chan) +{ + struct l2cap_conn *conn = chan->conn; + struct l2cap_ecred_conn_data data; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) + return; + + if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) + return; + + l2cap_ecred_init(chan, 0); + + data.pdu.req.psm = chan->psm; + data.pdu.req.mtu = cpu_to_le16(chan->imtu); + data.pdu.req.mps = cpu_to_le16(chan->mps); + data.pdu.req.credits = cpu_to_le16(chan->rx_credits); + data.pdu.scid[0] = cpu_to_le16(chan->scid); + + chan->ident = l2cap_get_ident(conn); + data.pid = chan->ops->get_peer_pid(chan); + + data.count = 1; + data.chan = chan; + data.pid = chan->ops->get_peer_pid(chan); + + __l2cap_chan_list(conn, l2cap_ecred_defer_connect, &data); + + l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_CONN_REQ, + sizeof(data.pdu.req) + data.count * sizeof(__le16), + &data.pdu); +} + static void l2cap_le_start(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; @@ -1318,8 +1466,12 @@ static void l2cap_le_start(struct l2cap_chan *chan) return; } - if (chan->state == BT_CONNECT) - l2cap_le_connect(chan); + if (chan->state == BT_CONNECT) { + if (chan->mode == L2CAP_MODE_EXT_FLOWCTL) + l2cap_ecred_connect(chan); + else + l2cap_le_connect(chan); + } } static void l2cap_start_connection(struct l2cap_chan *chan) @@ -1737,9 +1889,9 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_del(chan, err); - l2cap_chan_unlock(chan); - chan->ops->close(chan); + + l2cap_chan_unlock(chan); l2cap_chan_put(chan); } @@ -2505,6 +2657,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) switch (chan->mode) { case L2CAP_MODE_LE_FLOWCTL: + case L2CAP_MODE_EXT_FLOWCTL: /* Check outgoing MTU */ if (len > chan->omtu) return -EMSGSIZE; @@ -3773,6 +3926,45 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan) &rsp); } +void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan) +{ + struct { + struct l2cap_ecred_conn_rsp rsp; + __le16 dcid[5]; + } __packed pdu; + struct l2cap_conn *conn = chan->conn; + u16 ident = chan->ident; + int i = 0; + + if (!ident) + return; + + BT_DBG("chan %p ident %d", chan, ident); + + pdu.rsp.mtu = cpu_to_le16(chan->imtu); + pdu.rsp.mps = cpu_to_le16(chan->mps); + pdu.rsp.credits = cpu_to_le16(chan->rx_credits); + pdu.rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS); + + mutex_lock(&conn->chan_lock); + + list_for_each_entry(chan, &conn->chan_l, list) { + if (chan->ident != ident) 
+ continue; + + /* Reset ident so only one response is sent */ + chan->ident = 0; + + /* Include all channels pending with the same ident */ + pdu.dcid[i++] = cpu_to_le16(chan->scid); + } + + mutex_unlock(&conn->chan_lock); + + l2cap_send_cmd(conn, ident, L2CAP_ECRED_CONN_RSP, + sizeof(pdu.rsp) + i * sizeof(__le16), &pdu); +} + void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) { struct l2cap_conn_rsp rsp; @@ -4181,7 +4373,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, return 0; } - if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2) { + if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2 && + chan->state != BT_CONNECTED) { cmd_reject_invalid_cid(conn, cmd->ident, chan->scid, chan->dcid); goto unlock; @@ -4405,6 +4598,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, return 0; } + l2cap_chan_hold(chan); l2cap_chan_lock(chan); rsp.dcid = cpu_to_le16(chan->scid); @@ -4413,12 +4607,11 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_hold(chan); l2cap_chan_del(chan, ECONNRESET); - l2cap_chan_unlock(chan); - chan->ops->close(chan); + + l2cap_chan_unlock(chan); l2cap_chan_put(chan); mutex_unlock(&conn->chan_lock); @@ -4450,20 +4643,21 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } + l2cap_chan_hold(chan); l2cap_chan_lock(chan); if (chan->state != BT_DISCONN) { l2cap_chan_unlock(chan); + l2cap_chan_put(chan); mutex_unlock(&conn->chan_lock); return 0; } - l2cap_chan_hold(chan); l2cap_chan_del(chan, 0); - l2cap_chan_unlock(chan); - chan->ops->close(chan); + + l2cap_chan_unlock(chan); l2cap_chan_put(chan); mutex_unlock(&conn->chan_lock); @@ -5714,6 +5908,356 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn, return 0; } +static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_ecred_conn_req *req = (void *) data; + struct { + struct l2cap_ecred_conn_rsp rsp; + __le16 dcid[5]; + } __packed pdu; + struct l2cap_chan *chan, *pchan; + u16 mtu, mps; + __le16 psm; + u8 result, len = 0; + int i, num_scid; + bool defer = false; + + if (!enable_ecred) + return -EINVAL; + + if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) { + result = L2CAP_CR_LE_INVALID_PARAMS; + goto response; + } + + mtu = __le16_to_cpu(req->mtu); + mps = __le16_to_cpu(req->mps); + + if (mtu < L2CAP_ECRED_MIN_MTU || mps < L2CAP_ECRED_MIN_MPS) { + result = L2CAP_CR_LE_UNACCEPT_PARAMS; + goto response; + } + + psm = req->psm; + + BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps); + + memset(&pdu, 0, sizeof(pdu)); + + /* Check if we have socket listening on psm */ + pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src, + &conn->hcon->dst, LE_LINK); + if (!pchan) { + result = L2CAP_CR_LE_BAD_PSM; + goto response; + } + + mutex_lock(&conn->chan_lock); + l2cap_chan_lock(pchan); + + if (!smp_sufficient_security(conn->hcon, pchan->sec_level, + SMP_ALLOW_STK)) { + result = L2CAP_CR_LE_AUTHENTICATION; + goto unlock; + } + + result = L2CAP_CR_LE_SUCCESS; + cmd_len -= sizeof(req); + num_scid = cmd_len / sizeof(u16); + + for (i = 0; i < num_scid; i++) { + u16 scid = __le16_to_cpu(req->scid[i]); + + BT_DBG("scid[%d] 0x%4.4x", i, scid); + + pdu.dcid[i] = 0x0000; + len += sizeof(*pdu.dcid); + + /* Check for valid dynamic CID range */ + if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { + result = L2CAP_CR_LE_INVALID_SCID; + continue; + } + + /* Check if we 
already have channel with that dcid */ + if (__l2cap_get_chan_by_dcid(conn, scid)) { + result = L2CAP_CR_LE_SCID_IN_USE; + continue; + } + + chan = pchan->ops->new_connection(pchan); + if (!chan) { + result = L2CAP_CR_LE_NO_MEM; + continue; + } + + bacpy(&chan->src, &conn->hcon->src); + bacpy(&chan->dst, &conn->hcon->dst); + chan->src_type = bdaddr_src_type(conn->hcon); + chan->dst_type = bdaddr_dst_type(conn->hcon); + chan->psm = psm; + chan->dcid = scid; + chan->omtu = mtu; + chan->remote_mps = mps; + + __l2cap_chan_add(conn, chan); + + l2cap_ecred_init(chan, __le16_to_cpu(req->credits)); + + /* Init response */ + if (!pdu.rsp.credits) { + pdu.rsp.mtu = cpu_to_le16(chan->imtu); + pdu.rsp.mps = cpu_to_le16(chan->mps); + pdu.rsp.credits = cpu_to_le16(chan->rx_credits); + } + + pdu.dcid[i] = cpu_to_le16(chan->scid); + + __set_chan_timer(chan, chan->ops->get_sndtimeo(chan)); + + chan->ident = cmd->ident; + + if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { + l2cap_state_change(chan, BT_CONNECT2); + defer = true; + chan->ops->defer(chan); + } else { + l2cap_chan_ready(chan); + } + } + +unlock: + l2cap_chan_unlock(pchan); + mutex_unlock(&conn->chan_lock); + l2cap_chan_put(pchan); + +response: + pdu.rsp.result = cpu_to_le16(result); + + if (defer) + return 0; + + l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP, + sizeof(pdu.rsp) + len, &pdu); + + return 0; +} + +static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_ecred_conn_rsp *rsp = (void *) data; + struct hci_conn *hcon = conn->hcon; + u16 mtu, mps, credits, result; + struct l2cap_chan *chan; + int err = 0, sec_level; + int i = 0; + + if (cmd_len < sizeof(*rsp)) + return -EPROTO; + + mtu = __le16_to_cpu(rsp->mtu); + mps = __le16_to_cpu(rsp->mps); + credits = __le16_to_cpu(rsp->credits); + result = __le16_to_cpu(rsp->result); + + BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits, + result); + + mutex_lock(&conn->chan_lock); + + cmd_len -= sizeof(*rsp); + + list_for_each_entry(chan, &conn->chan_l, list) { + u16 dcid; + + if (chan->ident != cmd->ident || + chan->mode != L2CAP_MODE_EXT_FLOWCTL || + chan->state == BT_CONNECTED) + continue; + + l2cap_chan_lock(chan); + + /* Check that there is a dcid for each pending channel */ + if (cmd_len < sizeof(dcid)) { + l2cap_chan_del(chan, ECONNREFUSED); + l2cap_chan_unlock(chan); + continue; + } + + dcid = __le16_to_cpu(rsp->dcid[i++]); + cmd_len -= sizeof(u16); + + BT_DBG("dcid[%d] 0x%4.4x", i, dcid); + + /* Check if dcid is already in use */ + if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) { + /* If a device receives a + * L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an + * already-assigned Destination CID, then both the + * original channel and the new channel shall be + * immediately discarded and not used. + */ + l2cap_chan_del(chan, ECONNREFUSED); + l2cap_chan_unlock(chan); + chan = __l2cap_get_chan_by_dcid(conn, dcid); + l2cap_chan_lock(chan); + l2cap_chan_del(chan, ECONNRESET); + l2cap_chan_unlock(chan); + continue; + } + + switch (result) { + case L2CAP_CR_LE_AUTHENTICATION: + case L2CAP_CR_LE_ENCRYPTION: + /* If we already have MITM protection we can't do + * anything. 
+ */ + if (hcon->sec_level > BT_SECURITY_MEDIUM) { + l2cap_chan_del(chan, ECONNREFUSED); + break; + } + + sec_level = hcon->sec_level + 1; + if (chan->sec_level < sec_level) + chan->sec_level = sec_level; + + /* We'll need to send a new Connect Request */ + clear_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags); + + smp_conn_security(hcon, chan->sec_level); + break; + + case L2CAP_CR_LE_BAD_PSM: + l2cap_chan_del(chan, ECONNREFUSED); + break; + + default: + /* If dcid was not set it means channels was refused */ + if (!dcid) { + l2cap_chan_del(chan, ECONNREFUSED); + break; + } + + chan->ident = 0; + chan->dcid = dcid; + chan->omtu = mtu; + chan->remote_mps = mps; + chan->tx_credits = credits; + l2cap_chan_ready(chan); + break; + } + + l2cap_chan_unlock(chan); + } + + mutex_unlock(&conn->chan_lock); + + return err; +} + +static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_ecred_reconf_req *req = (void *) data; + struct l2cap_ecred_reconf_rsp rsp; + u16 mtu, mps, result; + struct l2cap_chan *chan; + int i, num_scid; + + if (!enable_ecred) + return -EINVAL; + + if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) { + result = L2CAP_CR_LE_INVALID_PARAMS; + goto respond; + } + + mtu = __le16_to_cpu(req->mtu); + mps = __le16_to_cpu(req->mps); + + BT_DBG("mtu %u mps %u", mtu, mps); + + if (mtu < L2CAP_ECRED_MIN_MTU) { + result = L2CAP_RECONF_INVALID_MTU; + goto respond; + } + + if (mps < L2CAP_ECRED_MIN_MPS) { + result = L2CAP_RECONF_INVALID_MPS; + goto respond; + } + + cmd_len -= sizeof(*req); + num_scid = cmd_len / sizeof(u16); + result = L2CAP_RECONF_SUCCESS; + + for (i = 0; i < num_scid; i++) { + u16 scid; + + scid = __le16_to_cpu(req->scid[i]); + if (!scid) + return -EPROTO; + + chan = __l2cap_get_chan_by_dcid(conn, scid); + if (!chan) + continue; + + /* If the MTU value is decreased for any of the included + * channels, then the receiver shall disconnect all + * included channels. 
+ */ + if (chan->omtu > mtu) { + BT_ERR("chan %p decreased MTU %u -> %u", chan, + chan->omtu, mtu); + result = L2CAP_RECONF_INVALID_MTU; + } + + chan->omtu = mtu; + chan->remote_mps = mps; + } + +respond: + rsp.result = cpu_to_le16(result); + + l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_RECONF_RSP, sizeof(rsp), + &rsp); + + return 0; +} + +static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn, + struct l2cap_cmd_hdr *cmd, u16 cmd_len, + u8 *data) +{ + struct l2cap_chan *chan; + struct l2cap_ecred_conn_rsp *rsp = (void *) data; + u16 result; + + if (cmd_len < sizeof(*rsp)) + return -EPROTO; + + result = __le16_to_cpu(rsp->result); + + BT_DBG("result 0x%4.4x", rsp->result); + + if (!result) + return 0; + + list_for_each_entry(chan, &conn->chan_l, list) { + if (chan->ident != cmd->ident) + continue; + + l2cap_chan_del(chan, ECONNRESET); + } + + return 0; +} + static inline int l2cap_le_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) @@ -5769,6 +6313,22 @@ static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn, err = l2cap_le_credits(conn, cmd, cmd_len, data); break; + case L2CAP_ECRED_CONN_REQ: + err = l2cap_ecred_conn_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_ECRED_CONN_RSP: + err = l2cap_ecred_conn_rsp(conn, cmd, cmd_len, data); + break; + + case L2CAP_ECRED_RECONF_REQ: + err = l2cap_ecred_reconf_req(conn, cmd, cmd_len, data); + break; + + case L2CAP_ECRED_RECONF_RSP: + err = l2cap_ecred_reconf_rsp(conn, cmd, cmd_len, data); + break; + case L2CAP_DISCONN_REQ: err = l2cap_disconnect_req(conn, cmd, cmd_len, data); break; @@ -5831,9 +6391,7 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) { struct hci_conn *hcon = conn->hcon; - u8 *data = skb->data; - int len = skb->len; - struct l2cap_cmd_hdr cmd; + struct l2cap_cmd_hdr *cmd; int err; l2cap_raw_recv(conn, skb); @@ -5841,35 +6399,34 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn, if (hcon->type != ACL_LINK) goto drop; - while (len >= L2CAP_CMD_HDR_SIZE) { - u16 cmd_len; - memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE); - data += L2CAP_CMD_HDR_SIZE; - len -= L2CAP_CMD_HDR_SIZE; + while (skb->len >= L2CAP_CMD_HDR_SIZE) { + u16 len; + + cmd = (void *) skb->data; + skb_pull(skb, L2CAP_CMD_HDR_SIZE); - cmd_len = le16_to_cpu(cmd.len); + len = le16_to_cpu(cmd->len); - BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd_len, - cmd.ident); + BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd->code, len, + cmd->ident); - if (cmd_len > len || !cmd.ident) { + if (len > skb->len || !cmd->ident) { BT_DBG("corrupted command"); break; } - err = l2cap_bredr_sig_cmd(conn, &cmd, cmd_len, data); + err = l2cap_bredr_sig_cmd(conn, cmd, len, skb->data); if (err) { struct l2cap_cmd_rej_unk rej; BT_ERR("Wrong link type (%d)", err); rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD); - l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, + l2cap_send_cmd(conn, cmd->ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); } - data += cmd_len; - len -= cmd_len; + skb_pull(skb, len); } drop: @@ -6814,11 +7371,13 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) struct l2cap_le_credits pkt; u16 return_credits; - return_credits = ((chan->imtu / chan->mps) + 1) - chan->rx_credits; + return_credits = (chan->imtu / chan->mps) + 1; - if (!return_credits) + if (chan->rx_credits >= return_credits) return; + return_credits -= chan->rx_credits; + BT_DBG("chan %p returning %u credits to sender", chan, return_credits); chan->rx_credits += return_credits; @@ -6831,7 
+7390,7 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan) l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt); } -static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb) +static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb) { int err; @@ -6846,7 +7405,7 @@ static int l2cap_le_recv(struct l2cap_chan *chan, struct sk_buff *skb) return err; } -static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) +static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) { int err; @@ -6894,7 +7453,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) } if (skb->len == sdu_len) - return l2cap_le_recv(chan, skb); + return l2cap_ecred_recv(chan, skb); chan->sdu = skb; chan->sdu_len = sdu_len; @@ -6926,7 +7485,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb) skb = NULL; if (chan->sdu->len == chan->sdu_len) { - err = l2cap_le_recv(chan, chan->sdu); + err = l2cap_ecred_recv(chan, chan->sdu); if (!err) { chan->sdu = NULL; chan->sdu_last_frag = NULL; @@ -6987,7 +7546,8 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, switch (chan->mode) { case L2CAP_MODE_LE_FLOWCTL: - if (l2cap_le_data_rcv(chan, skb) < 0) + case L2CAP_MODE_EXT_FLOWCTL: + if (l2cap_ecred_data_rcv(chan, skb) < 0) goto drop; goto done; @@ -7206,6 +7766,33 @@ static bool is_valid_psm(u16 psm, u8 dst_type) { return ((psm & 0x0101) == 0x0001); } +struct l2cap_chan_data { + struct l2cap_chan *chan; + struct pid *pid; + int count; +}; + +static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data) +{ + struct l2cap_chan_data *d = data; + struct pid *pid; + + if (chan == d->chan) + return; + + if (!test_bit(FLAG_DEFER_SETUP, &chan->flags)) + return; + + pid = chan->ops->get_peer_pid(chan); + + /* Only count deferred channels with the same PID/PSM */ + if (d->pid != pid || chan->psm != d->chan->psm || chan->ident || + chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT) + return; + + d->count++; +} + int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst, u8 dst_type) { @@ -7214,8 +7801,8 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, struct hci_dev *hdev; int err; - BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst, - dst_type, __le16_to_cpu(psm)); + BT_DBG("%pMR -> %pMR (type %u) psm 0x%4.4x mode 0x%2.2x", &chan->src, + dst, dst_type, __le16_to_cpu(psm), chan->mode); hdev = hci_get_route(dst, &chan->src, chan->src_type); if (!hdev) @@ -7244,6 +7831,12 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, break; case L2CAP_MODE_LE_FLOWCTL: break; + case L2CAP_MODE_EXT_FLOWCTL: + if (!enable_ecred) { + err = -EOPNOTSUPP; + goto done; + } + break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: if (!disable_ertm) @@ -7319,6 +7912,23 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, goto done; } + if (chan->mode == L2CAP_MODE_EXT_FLOWCTL) { + struct l2cap_chan_data data; + + data.chan = chan; + data.pid = chan->ops->get_peer_pid(chan); + data.count = 1; + + l2cap_chan_list(conn, l2cap_chan_by_pid, &data); + + /* Check if there isn't too many channels being connected */ + if (data.count > L2CAP_ECRED_CONN_SCID_MAX) { + hci_conn_drop(hcon); + err = -EPROTO; + goto done; + } + } + mutex_lock(&conn->chan_lock); l2cap_chan_lock(chan); @@ -7368,6 +7978,38 @@ done: } EXPORT_SYMBOL_GPL(l2cap_chan_connect); +static void l2cap_ecred_reconfigure(struct l2cap_chan 
*chan) +{ + struct l2cap_conn *conn = chan->conn; + struct { + struct l2cap_ecred_reconf_req req; + __le16 scid; + } pdu; + + pdu.req.mtu = cpu_to_le16(chan->imtu); + pdu.req.mps = cpu_to_le16(chan->mps); + pdu.scid = cpu_to_le16(chan->scid); + + chan->ident = l2cap_get_ident(conn); + + l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ, + sizeof(pdu), &pdu); +} + +int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu) +{ + if (chan->imtu > mtu) + return -EINVAL; + + BT_DBG("chan %p mtu 0x%4.4x", chan, mtu); + + chan->imtu = mtu; + + l2cap_ecred_reconfigure(chan); + + return 0; +} + /* ---- L2CAP interface with lower layer (HCI) ---- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr) @@ -7579,7 +8221,8 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); } else if (chan->state == BT_CONNECT2 && - chan->mode != L2CAP_MODE_LE_FLOWCTL) { + !(chan->mode == L2CAP_MODE_EXT_FLOWCTL || + chan->mode == L2CAP_MODE_LE_FLOWCTL)) { struct l2cap_conn_rsp rsp; __u16 res, stat; @@ -7787,3 +8430,6 @@ void l2cap_exit(void) module_param(disable_ertm, bool, 0644); MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode"); + +module_param(enable_ecred, bool, 0644); +MODULE_PARM_DESC(enable_ecred, "Enable enhanced credit flow control mode"); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a7be8b59b3c2..117ba20ea194 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -232,7 +232,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, return -EINVAL; } - if (chan->psm && bdaddr_type_is_le(chan->src_type)) + if (chan->psm && bdaddr_type_is_le(chan->src_type) && !chan->mode) chan->mode = L2CAP_MODE_LE_FLOWCTL; err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), @@ -274,6 +274,12 @@ static int l2cap_sock_listen(struct socket *sock, int backlog) case L2CAP_MODE_BASIC: case L2CAP_MODE_LE_FLOWCTL: break; + case L2CAP_MODE_EXT_FLOWCTL: + if (!enable_ecred) { + err = -EOPNOTSUPP; + goto done; + } + break; case L2CAP_MODE_ERTM: case L2CAP_MODE_STREAMING: if (!disable_ertm) @@ -427,6 +433,8 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, opts.max_tx = chan->max_tx; opts.txwin_size = chan->tx_win; + BT_DBG("mode 0x%2.2x", chan->mode); + len = min_t(unsigned int, len, sizeof(opts)); if (copy_to_user(optval, (char *) &opts, len)) err = -EFAULT; @@ -499,6 +507,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct bt_security sec; struct bt_power pwr; + u32 phys; int len, err = 0; BT_DBG("sk %p", sk); @@ -603,6 +612,18 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_PHY: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + phys = hci_conn_get_phy(chan->conn->hcon); + + if (put_user(phys, (u32 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -694,6 +715,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } + BT_DBG("mode 0x%2.2x", chan->mode); + chan->imtu = opts.imtu; chan->omtu = opts.omtu; chan->fcs = opts.fcs; @@ -926,7 +949,8 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (sk->sk_state == BT_CONNECTED) { + if (chan->mode == L2CAP_MODE_LE_FLOWCTL && + sk->sk_state == BT_CONNECTED) { err = -EISCONN; break; } @@ -936,7 
+960,12 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; } - chan->imtu = opt; + if (chan->mode == L2CAP_MODE_EXT_FLOWCTL && + sk->sk_state == BT_CONNECTED) + err = l2cap_chan_reconfigure(chan, opt); + else + chan->imtu = opt; + break; default: @@ -991,7 +1020,11 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { - if (bdaddr_type_is_le(pi->chan->src_type)) { + if (pi->chan->mode == L2CAP_MODE_EXT_FLOWCTL) { + sk->sk_state = BT_CONNECTED; + pi->chan->state = BT_CONNECTED; + __l2cap_ecred_conn_rsp_defer(pi->chan); + } else if (bdaddr_type_is_le(pi->chan->src_type)) { sk->sk_state = BT_CONNECTED; pi->chan->state = BT_CONNECTED; __l2cap_le_connect_rsp_defer(pi->chan); @@ -1042,7 +1075,7 @@ done: } /* Kill socket (only if zapped and orphan) - * Must be called on unlocked socket. + * Must be called on unlocked socket, with l2cap channel lock. */ static void l2cap_sock_kill(struct sock *sk) { @@ -1193,6 +1226,7 @@ static int l2cap_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err; + struct l2cap_chan *chan; BT_DBG("sock %p, sk %p", sock, sk); @@ -1202,9 +1236,17 @@ static int l2cap_sock_release(struct socket *sock) bt_sock_unlink(&l2cap_sk_list, sk); err = l2cap_sock_shutdown(sock, 2); + chan = l2cap_pi(sk)->chan; + + l2cap_chan_hold(chan); + l2cap_chan_lock(chan); sock_orphan(sk); l2cap_sock_kill(sk); + + l2cap_chan_unlock(chan); + l2cap_chan_put(chan); + return err; } @@ -1222,12 +1264,15 @@ static void l2cap_sock_cleanup_listen(struct sock *parent) BT_DBG("child chan %p state %s", chan, state_to_string(chan->state)); + l2cap_chan_hold(chan); l2cap_chan_lock(chan); + __clear_chan_timer(chan); l2cap_chan_close(chan, ECONNRESET); - l2cap_chan_unlock(chan); - l2cap_sock_kill(sk); + + l2cap_chan_unlock(chan); + l2cap_chan_put(chan); } } @@ -1459,6 +1504,13 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) return sk->sk_sndtimeo; } +static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan) +{ + struct sock *sk = chan->data; + + return sk->sk_peer_pid; +} + static void l2cap_sock_suspend_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; @@ -1480,6 +1532,7 @@ static const struct l2cap_ops l2cap_chan_ops = { .suspend = l2cap_sock_suspend_cb, .set_shutdown = l2cap_sock_set_shutdown_cb, .get_sndtimeo = l2cap_sock_get_sndtimeo_cb, + .get_peer_pid = l2cap_sock_get_peer_pid_cb, .alloc_skb = l2cap_sock_alloc_skb_cb, }; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3074363c68df..6552003a170e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 15 +#define MGMT_REVISION 16 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -107,6 +107,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_READ_EXT_INFO, MGMT_OP_SET_APPEARANCE, MGMT_OP_SET_BLOCKED_KEYS, + MGMT_OP_SET_WIDEBAND_SPEECH, }; static const u16 mgmt_events[] = { @@ -762,6 +763,10 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_sc_capable(hdev)) settings |= MGMT_SETTING_SECURE_CONN; + + if (test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, + &hdev->quirks)) + settings |= MGMT_SETTING_WIDEBAND_SPEECH; } if (lmp_le_capable(hdev)) { @@ -846,6 +851,9 @@ static u32 get_current_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_STATIC_ADDRESS; } + if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) + settings 
|= MGMT_SETTING_WIDEBAND_SPEECH; + return settings; } @@ -1382,6 +1390,12 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } + if (hdev->advertising_paused) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE, + MGMT_STATUS_BUSY); + goto failed; + } + if (!hdev_is_powered(hdev)) { bool changed = false; @@ -3589,6 +3603,62 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, err, NULL, 0); } +static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_mode *cp = data; + int err; + bool changed = false; + + BT_DBG("request for %s", hdev->name); + + if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_STATUS_NOT_SUPPORTED); + + if (cp->val != 0x00 && cp->val != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + + if (pending_find(MGMT_OP_SET_WIDEBAND_SPEECH, hdev)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_STATUS_BUSY); + goto unlock; + } + + if (hdev_is_powered(hdev) && + !!cp->val != hci_dev_test_flag(hdev, + HCI_WIDEBAND_SPEECH_ENABLED)) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_STATUS_REJECTED); + goto unlock; + } + + if (cp->val) + changed = !hci_dev_test_and_set_flag(hdev, + HCI_WIDEBAND_SPEECH_ENABLED); + else + changed = hci_dev_test_and_clear_flag(hdev, + HCI_WIDEBAND_SPEECH_ENABLED); + + err = send_settings_rsp(sk, MGMT_OP_SET_WIDEBAND_SPEECH, hdev); + if (err < 0) + goto unlock; + + if (changed) + err = new_settings(hdev, sk); + +unlock: + hci_dev_unlock(hdev); + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -3865,6 +3935,13 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) } hci_dev_unlock(hdev); + + /* Handle suspend notifier */ + if (test_and_clear_bit(SUSPEND_UNPAUSE_DISCOVERY, + hdev->suspend_tasks)) { + bt_dev_dbg(hdev, "Unpaused discovery"); + wake_up(&hdev->suspend_wait_q); + } } static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, @@ -3926,6 +4003,13 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, goto failed; } + /* Can't start discovery when it is paused */ + if (hdev->discovery_paused) { + err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY, + &cp->type, sizeof(cp->type)); + goto failed; + } + /* Clear the discovery filter first to free any previously * allocated memory for the UUID list. 
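[Editor's sketch] The set_wideband_speech() handler above follows the usual mgmt pattern: reject controllers without the quirk, validate the mode byte, refuse changes while powered, then toggle HCI_WIDEBAND_SPEECH_ENABLED and emit a settings response. A minimal userspace sketch of driving it over the HCI control channel follows; the opcode value and the local sockaddr_hci/header copies are assumptions made to keep the example self-contained (verify against the mgmt-api document), and the packed initializer assumes a little-endian host.

#include <stdint.h>
#include <unistd.h>
#include <sys/socket.h>

#define BTPROTO_HCI                 1
#define HCI_CHANNEL_CONTROL         3
#define HCI_DEV_NONE                0xffff
#define MGMT_OP_SET_WIDEBAND_SPEECH 0x0047   /* assumed opcode, verify */

struct sockaddr_hci {                        /* local copy of the uapi layout */
	sa_family_t    hci_family;
	unsigned short hci_dev;
	unsigned short hci_channel;
};

static int set_wideband_speech(uint16_t index, uint8_t enable)
{
	/* mgmt wire format: little-endian opcode/index/len header, then
	 * the command parameters (here a single mode byte). */
	struct __attribute__((packed)) {
		uint16_t opcode, index, len;
		uint8_t  val;
	} cmd = { MGMT_OP_SET_WIDEBAND_SPEECH, index, 1, enable };
	struct sockaddr_hci addr = {
		.hci_family  = AF_BLUETOOTH,
		.hci_dev     = HCI_DEV_NONE,
		.hci_channel = HCI_CHANNEL_CONTROL,
	};
	int fd = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);

	if (fd < 0)
		return -1;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    write(fd, &cmd, sizeof(cmd)) != sizeof(cmd)) {
		close(fd);
		return -1;
	}
	/* Only 0x00/0x01 pass the handler's check, and a change is
	 * rejected while the controller is powered. */
	return close(fd);
}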
*/ @@ -4093,6 +4177,12 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) } hci_dev_unlock(hdev); + + /* Handle suspend notifier */ + if (test_and_clear_bit(SUSPEND_PAUSE_DISCOVERY, hdev->suspend_tasks)) { + bt_dev_dbg(hdev, "Paused discovery"); + wake_up(&hdev->suspend_wait_q); + } } static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, @@ -4324,6 +4414,17 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status, if (match.sk) sock_put(match.sk); + /* Handle suspend notifier */ + if (test_and_clear_bit(SUSPEND_PAUSE_ADVERTISING, + hdev->suspend_tasks)) { + bt_dev_dbg(hdev, "Paused advertising"); + wake_up(&hdev->suspend_wait_q); + } else if (test_and_clear_bit(SUSPEND_UNPAUSE_ADVERTISING, + hdev->suspend_tasks)) { + bt_dev_dbg(hdev, "Unpaused advertising"); + wake_up(&hdev->suspend_wait_q); + } + /* If "Set Advertising" was just disabled and instance advertising was * set up earlier, then re-enable multi-instance advertising. */ @@ -4375,6 +4476,10 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); + if (hdev->advertising_paused) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + MGMT_STATUS_BUSY); + hci_dev_lock(hdev); val = !!cp->val; @@ -6743,8 +6848,11 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, if (!err) err = hci_req_run(&req, add_advertising_complete); - if (err < 0) + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + MGMT_STATUS_FAILED); mgmt_pending_remove(cmd); + } unlock: hci_dev_unlock(hdev); @@ -6990,6 +7098,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE }, { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, + { set_wideband_speech, MGMT_SETTING_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 3a9e9d9670be..2e20af317cea 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -40,7 +40,6 @@ static bool disable_cfc; static bool l2cap_ertm; static int channel_mtu = -1; -static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU; static struct task_struct *rfcomm_thread; @@ -73,8 +72,6 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s); /* ---- RFCOMM frame parsing macros ---- */ #define __get_dlci(b) ((b & 0xfc) >> 2) -#define __get_channel(b) ((b & 0xf8) >> 3) -#define __get_dir(b) ((b & 0x04) >> 2) #define __get_type(b) ((b & 0xef)) #define __test_ea(b) ((b & 0x01)) @@ -87,7 +84,6 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s); #define __ctrl(type, pf) (((type & 0xef) | (pf << 4))) #define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir) #define __srv_channel(dlci) (dlci >> 1) -#define __dir(dlci) (dlci & 0x01) #define __len8(len) (((len) << 1) | 1) #define __len16(len) ((len) << 1) @@ -752,7 +748,8 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, /* Set L2CAP options */ sk = sock->sk; lock_sock(sk); - l2cap_pi(sk)->chan->imtu = l2cap_mtu; + /* Set MTU to 0 so L2CAP can auto select the MTU */ + l2cap_pi(sk)->chan->imtu = 0; l2cap_pi(sk)->chan->sec_level = sec_level; if (l2cap_ertm) l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM; @@ -2039,7 +2036,8 @@ static int rfcomm_add_listener(bdaddr_t *ba) /* Set L2CAP options */ sk = sock->sk; lock_sock(sk); - l2cap_pi(sk)->chan->imtu = 
l2cap_mtu; + /* Set MTU to 0 so L2CAP can auto select the MTU */ + l2cap_pi(sk)->chan->imtu = 0; release_sock(sk); /* Start listening on the socket */ @@ -2237,9 +2235,6 @@ MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control"); module_param(channel_mtu, int, 0644); MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel"); -module_param(l2cap_mtu, uint, 0644); -MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection"); - module_param(l2cap_ertm, bool, 0644); MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection"); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 0c7d31c6c18c..a58584949a95 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -413,10 +413,8 @@ static int __rfcomm_create_dev(struct sock *sk, void __user *arg) dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel); if (IS_ERR(dlc)) return PTR_ERR(dlc); - else if (dlc) { - rfcomm_dlc_put(dlc); + if (dlc) return -EBUSY; - } dlc = rfcomm_dlc_alloc(GFP_KERNEL); if (!dlc) return -ENOMEM; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index b91d6b440fdf..c8c3d38cdc7b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -922,6 +922,7 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; int len, err = 0; struct bt_voice voice; + u32 phys; BT_DBG("sk %p", sk); @@ -956,6 +957,18 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, break; + case BT_PHY: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon); + + if (put_user(phys, (u32 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 204f14f8b507..1476a91ce935 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -1145,7 +1145,7 @@ static void sc_generate_link_key(struct smp_chan *smp) return; if (test_bit(SMP_FLAG_CT2, &smp->flags)) { - /* SALT = 0x00000000000000000000000000000000746D7031 */ + /* SALT = 0x000000000000000000000000746D7031 */ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) { @@ -1203,7 +1203,7 @@ static void sc_generate_ltk(struct smp_chan *smp) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); if (test_bit(SMP_FLAG_CT2, &smp->flags)) { - /* SALT = 0x00000000000000000000000000000000746D7032 */ + /* SALT = 0x000000000000000000000000746D7032 */ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk)) @@ -2115,7 +2115,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; - u8 *pkax, *pkbx, *na, *nb; + u8 *pkax, *pkbx, *na, *nb, confirm_hint; u32 passkey; int err; @@ -2168,6 +2168,24 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); + + /* Only Just-Works pairing requires extra checks */ + if (smp->method != JUST_WORKS) + goto mackey_and_ltk; + + /* If there already exists long term key in local host, leave + * the decision to user space since the remote device could + * be legitimate or malicious. + */ + if (hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, + hcon->role)) { + /* Set passkey to 0. 
The value can be any number since + * it'll be ignored anyway. + */ + passkey = 0; + confirm_hint = 1; + goto confirm; + } } mackey_and_ltk: @@ -2188,8 +2206,11 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; + confirm_hint = 0; + +confirm: err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, - hcon->dst_type, passkey, 0); + hcon->dst_type, passkey, confirm_hint); if (err) return SMP_UNSPECIFIED; diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c index 77396a098fbe..efea4874743e 100644 --- a/net/bpfilter/main.c +++ b/net/bpfilter/main.c @@ -10,7 +10,7 @@ #include <asm/unistd.h> #include "msgfmt.h" -int debug_fd; +FILE *debug_f; static int handle_get_cmd(struct mbox_request *cmd) { @@ -35,9 +35,10 @@ static void loop(void) struct mbox_reply reply; int n; + fprintf(debug_f, "testing the buffer\n"); n = read(0, &req, sizeof(req)); if (n != sizeof(req)) { - dprintf(debug_fd, "invalid request %d\n", n); + fprintf(debug_f, "invalid request %d\n", n); return; } @@ -47,7 +48,7 @@ static void loop(void) n = write(1, &reply, sizeof(reply)); if (n != sizeof(reply)) { - dprintf(debug_fd, "reply failed %d\n", n); + fprintf(debug_f, "reply failed %d\n", n); return; } } @@ -55,9 +56,10 @@ static void loop(void) int main(void) { - debug_fd = open("/dev/kmsg", 00000002); - dprintf(debug_fd, "Started bpfilter\n"); + debug_f = fopen("/dev/kmsg", "w"); + setvbuf(debug_f, 0, _IOLBF, 0); + fprintf(debug_f, "Started bpfilter\n"); loop(); - close(debug_fd); + fclose(debug_f); return 0; } diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index afee292fb004..162998e2f039 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -26,8 +26,8 @@ static size_t __get_vlan_tinfo_size(void) nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */ } -static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr, - struct net_bridge_vlan *v_last) +bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *v_last) { __be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id); __be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id); @@ -193,8 +193,8 @@ static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + [IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 }, }; -static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd, - u16 vid, u32 tun_id, bool *changed) +int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, + u16 vid, u32 tun_id, bool *changed) { int err = 0; @@ -250,8 +250,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, return 0; } -int br_process_vlan_tunnel_info(struct net_bridge *br, - struct net_bridge_port *p, int cmd, +int br_process_vlan_tunnel_info(const struct net_bridge *br, + const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, struct vtunnel_info *tinfo_last, bool *changed) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 5153ffe79a01..1f97703a52ff 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1199,8 +1199,8 @@ static inline void br_vlan_notify(const struct net_bridge *br, /* br_vlan_options.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING -bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, - const struct net_bridge_vlan *v2); +bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end); bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v); size_t br_vlan_opts_nl_size(void); int 
br_vlan_process_options(const struct net_bridge *br, diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h index 2bdef2ea3420..c54cc26211d7 100644 --- a/net/bridge/br_private_tunnel.h +++ b/net/bridge/br_private_tunnel.h @@ -18,8 +18,8 @@ struct vtunnel_info { /* br_netlink_tunnel.c */ int br_parse_vlan_tunnel_info(struct nlattr *attr, struct vtunnel_info *tinfo); -int br_process_vlan_tunnel_info(struct net_bridge *br, - struct net_bridge_port *p, +int br_process_vlan_tunnel_info(const struct net_bridge *br, + const struct net_bridge_port *p, int cmd, struct vtunnel_info *tinfo_curr, struct vtunnel_info *tinfo_last, @@ -32,8 +32,9 @@ int br_fill_vlan_tunnel_info(struct sk_buff *skb, /* br_vlan_tunnel.c */ int vlan_tunnel_init(struct net_bridge_vlan_group *vg); void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg); -int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid); -int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id); +int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid); +int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, + u32 tun_id); void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port); void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan); @@ -42,19 +43,23 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan_group *vg); int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan); +bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *v_last); +int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, + u16 vid, u32 tun_id, bool *changed); #else static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { return 0; } -static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, +static inline int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid) { return 0; } -static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, +static inline int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, u32 tun_id) { return 0; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6b5deca08b89..f9092c71225f 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1569,10 +1569,41 @@ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) } } +static bool br_vlan_stats_fill(struct sk_buff *skb, + const struct net_bridge_vlan *v) +{ + struct br_vlan_stats stats; + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); + if (!nest) + return false; + + br_vlan_get_stats(v, &stats); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, stats.rx_bytes, + BRIDGE_VLANDB_STATS_PAD) || + nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, + stats.rx_packets, BRIDGE_VLANDB_STATS_PAD) || + nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, stats.tx_bytes, + BRIDGE_VLANDB_STATS_PAD) || + nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, + stats.tx_packets, BRIDGE_VLANDB_STATS_PAD)) + goto out_err; + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + /* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts, - u16 flags) + u16 flags, + bool dump_stats) { struct bridge_vlan_info info; struct nlattr *nest; @@ -1596,8 
+1627,13 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) goto out_err; - if (v_opts && !br_vlan_opts_fill(skb, v_opts)) - goto out_err; + if (v_opts) { + if (!br_vlan_opts_fill(skb, v_opts)) + goto out_err; + + if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) + goto out_err; + } nla_nest_end(skb, nest); @@ -1675,7 +1711,7 @@ void br_vlan_notify(const struct net_bridge *br, goto out_kfree; } - if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags)) + if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false)) goto out_err; nlmsg_end(skb, nlh); @@ -1694,14 +1730,16 @@ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, { return v_curr->vid - range_end->vid == 1 && range_end->flags == v_curr->flags && - br_vlan_opts_eq(v_curr, range_end); + br_vlan_opts_eq_range(v_curr, range_end); } static int br_vlan_dump_dev(const struct net_device *dev, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, + u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; struct nlmsghdr *nlh = NULL; @@ -1754,12 +1792,13 @@ static int br_vlan_dump_dev(const struct net_device *dev, continue; } - if (v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { - u16 flags = br_vlan_flags(range_start, pvid); + if (dump_stats || v->vid == pvid || + !br_vlan_can_enter_range(v, range_end)) { + u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, - flags)) { + vlan_flags, dump_stats)) { err = -EMSGSIZE; break; } @@ -1778,7 +1817,8 @@ static int br_vlan_dump_dev(const struct net_device *dev, */ if (!err && range_start && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, - range_start, br_vlan_flags(range_start, pvid))) + range_start, br_vlan_flags(range_start, pvid), + dump_stats)) err = -EMSGSIZE; cb->args[1] = err ? 
idx : 0; @@ -1788,18 +1828,27 @@ static int br_vlan_dump_dev(const struct net_device *dev, return err; } +static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { + [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, +}; + static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; int idx = 0, err = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; + u32 dump_flags = 0; - err = nlmsg_parse(cb->nlh, sizeof(*bvm), NULL, 0, NULL, cb->extack); + err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, + br_vlan_db_dump_pol, cb->extack); if (err < 0) return err; bvm = nlmsg_data(cb->nlh); + if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) + dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); rcu_read_lock(); if (bvm->ifindex) { @@ -1808,7 +1857,7 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) err = -ENODEV; goto out_err; } - err = br_vlan_dump_dev(dev, skb, cb); + err = br_vlan_dump_dev(dev, skb, cb, dump_flags); if (err && err != -EMSGSIZE) goto out_err; } else { @@ -1816,7 +1865,7 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) if (idx < s_idx) goto skip; - err = br_vlan_dump_dev(dev, skb, cb); + err = br_vlan_dump_dev(dev, skb, cb, dump_flags); if (err == -EMSGSIZE) break; skip: @@ -1839,6 +1888,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = .len = sizeof(struct bridge_vlan_info) }, [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, }; static int br_vlan_rtm_process_one(struct net_device *dev, diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index cd2eb194eb98..b4add9ea8964 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -4,25 +4,58 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> +#include <net/ip_tunnels.h> #include "br_private.h" +#include "br_private_tunnel.h" -/* check if the options between two vlans are equal */ -bool br_vlan_opts_eq(const struct net_bridge_vlan *v1, - const struct net_bridge_vlan *v2) +static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v) { - return v1->state == v2->state; + __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id); + struct nlattr *nest; + + if (!v->tinfo.tunnel_dst) + return true; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO); + if (!nest) + return false; + if (nla_put_u32(skb, BRIDGE_VLANDB_TINFO_ID, be32_to_cpu(tid))) { + nla_nest_cancel(skb, nest); + return false; + } + nla_nest_end(skb, nest); + + return true; +} + +static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) || + vlan_tunid_inrange(v_curr, range_end); +} + +/* check if the options' state of v_curr allow it to enter the range */ +bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *range_end) +{ + return v_curr->state == range_end->state && + __vlan_tun_can_enter_range(v_curr, range_end); } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, - br_vlan_get_state(v)); + br_vlan_get_state(v)) && + __vlan_tun_put(skb, v); } size_t 
br_vlan_opts_nl_size(void) { - return nla_total_size(sizeof(u8)); /* BRIDGE_VLANDB_ENTRY_STATE */ + return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */ + + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_TINFO_ID */ } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, @@ -62,6 +95,68 @@ static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, return 0; } +static const struct nla_policy br_vlandb_tinfo_pol[BRIDGE_VLANDB_TINFO_MAX + 1] = { + [BRIDGE_VLANDB_TINFO_ID] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_TINFO_CMD] = { .type = NLA_U32 }, +}; + +static int br_vlan_modify_tunnel(const struct net_bridge_port *p, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + struct nlattr *tun_tb[BRIDGE_VLANDB_TINFO_MAX + 1], *attr; + struct bridge_vlan_info *vinfo; + u32 tun_id = 0; + int cmd, err; + + if (!p) { + NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans"); + return -EINVAL; + } + if (!(p->flags & BR_VLAN_TUNNEL)) { + NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set"); + return -EINVAL; + } + + attr = tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]; + err = nla_parse_nested(tun_tb, BRIDGE_VLANDB_TINFO_MAX, attr, + br_vlandb_tinfo_pol, extack); + if (err) + return err; + + if (!tun_tb[BRIDGE_VLANDB_TINFO_CMD]) { + NL_SET_ERR_MSG_MOD(extack, "Missing tunnel command attribute"); + return -ENOENT; + } + cmd = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_CMD]); + switch (cmd) { + case RTM_SETLINK: + if (!tun_tb[BRIDGE_VLANDB_TINFO_ID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing tunnel id attribute"); + return -ENOENT; + } + /* when working on vlan ranges this is the starting tunnel id */ + tun_id = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_ID]); + /* vlan info attr is guaranteed by br_vlan_rtm_process_one */ + vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); + /* tunnel ids are mapped to each vlan in increasing order, + * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the + * current vlan, so we compute: tun_id + v - vinfo->vid + */ + tun_id += v->vid - vinfo->vid; + break; + case RTM_DELLINK: + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel command"); + return -EINVAL; + } + + return br_vlan_tunnel_info(p, cmd, v->vid, tun_id, changed); +} + static int br_vlan_process_one_opts(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, @@ -80,6 +175,11 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, if (err) return err; } + if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]) { + err = br_vlan_modify_tunnel(p, v, tb, changed, extack); + if (err) + return err; + } return 0; } diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index d13d2080f527..169e005fbda2 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -89,7 +89,8 @@ out: /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ -int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) +int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, + u32 tun_id) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; @@ -107,7 +108,7 @@ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. 
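[Editor's sketch] The arithmetic in br_vlan_modify_tunnel() above maps a vlan range onto consecutive tunnel ids: each vlan gets the starting tunnel id offset by its distance from the first vid in the range (tun_id += v->vid - vinfo->vid). A standalone illustration of that mapping, with hypothetical vids and tunnel id, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned short start_vid = 100, end_vid = 102; /* example vlan range */
	unsigned int   start_tun_id = 1000;            /* BRIDGE_VLANDB_TINFO_ID */

	/* applied per vlan in the range, in increasing order */
	for (unsigned short vid = start_vid; vid <= end_vid; vid++)
		printf("vlan %u -> tunnel %u\n",
		       vid, start_tun_id + (vid - start_vid));
	return 0;
}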
*/ -int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid) +int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index e1256e03a9a8..78db58c7aec2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1561,7 +1561,7 @@ struct compat_ebt_entry_mwt { compat_uptr_t ptr; } u; compat_uint_t match_size; - compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace)))); + compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace)); }; /* account for possible padding between match_size and ->data */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5b4bd8261002..f8ca5edc5f2c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3248,12 +3248,16 @@ static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) static void ceph_msg_data_destroy(struct ceph_msg_data *data) { - if (data->type == CEPH_MSG_DATA_PAGELIST) + if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) { + int num_pages = calc_pages_for(data->alignment, data->length); + ceph_release_page_vector(data->pages, num_pages); + } else if (data->type == CEPH_MSG_DATA_PAGELIST) { ceph_pagelist_release(data->pagelist); + } } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, - size_t length, size_t alignment) + size_t length, size_t alignment, bool own_pages) { struct ceph_msg_data *data; @@ -3265,6 +3269,7 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; + data->own_pages = own_pages; msg->data_length += length; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b68b376d8c2f..af868d3923b9 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -962,7 +962,7 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, BUG_ON(length > (u64) SIZE_MAX); if (length) ceph_msg_data_add_pages(msg, osd_data->pages, - length, osd_data->alignment); + length, osd_data->alignment, false); } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { BUG_ON(!length); ceph_msg_data_add_pagelist(msg, osd_data->pagelist); @@ -4436,9 +4436,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, CEPH_MSG_DATA_PAGES); *lreq->preply_pages = data->pages; *lreq->preply_len = data->length; - } else { - ceph_release_page_vector(data->pages, - calc_pages_for(0, data->length)); + data->own_pages = false; } } lreq->notify_finish_error = return_code; @@ -5506,9 +5504,6 @@ out_unlock_osdc: return m; } -/* - * TODO: switch to a msg-owned pagelist - */ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) { struct ceph_msg *m; @@ -5522,7 +5517,6 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) if (data_len) { struct page **pages; - struct ceph_osd_data osd_data; pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), GFP_NOIO); @@ -5531,9 +5525,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) return NULL; } - ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false, - false); - ceph_osdc_msg_data_add(m, &osd_data); + ceph_msg_data_add_pages(m, pages, data_len, 0, true); } return m; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4e0de14f80bb..2a6e63a8edbe 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ 
-710,6 +710,15 @@ int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) } EXPORT_SYMBOL(ceph_pg_poolid_by_name); +u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) +{ + struct ceph_pg_pool_info *pi; + + pi = __lookup_pg_pool(&map->pg_pools, id); + return pi ? pi->flags : 0; +} +EXPORT_SYMBOL(ceph_pg_pool_flags); + static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { rb_erase(&pi->node, root); diff --git a/net/core/datagram.c b/net/core/datagram.c index 4213081c6ed3..639745d4f3b9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/uio.h> +#include <linux/indirect_call_wrapper.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -403,6 +404,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); +INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, + size_t bytes, + void *data __always_unused, + struct iov_iter *i)); + static int __skb_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, bool fault_short, size_t (*cb)(const void *, size_t, void *, @@ -416,7 +422,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - n = cb(skb->data + offset, copy, data, to); + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; @@ -438,8 +445,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, if (copy > len) copy = len; - n = cb(vaddr + skb_frag_off(frag) + offset - start, - copy, data, to); + n = INDIRECT_CALL_1(cb, simple_copy_to_iter, + vaddr + skb_frag_off(frag) + offset - start, + copy, data, to); kunmap(page); offset += n; if (n != copy) diff --git a/net/core/dev.c b/net/core/dev.c index 651a3c28d33a..9c9e763bfe0e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3266,7 +3266,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * - * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. + * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) @@ -3295,7 +3295,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, features &= ~NETIF_F_GSO_PARTIAL; } - BUILD_BUG_ON(SKB_SGO_CB_OFFSET + + BUILD_BUG_ON(SKB_GSO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); @@ -4516,7 +4516,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_is_tc_redirected(skb)) + if (skb_is_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom @@ -5063,7 +5063,7 @@ skip_taps: goto out; } #endif - skb_reset_tc(skb); + skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; @@ -5195,7 +5195,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. 
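[Editor's sketch] The INDIRECT_CALL_1() uses added to __skb_datagram_iter() above avoid a retpoline-penalized indirect call by comparing the callback against its most likely target and calling that target directly. A simplified stand-in for the macro, not the kernel's exact definition, to show the shape:

#include <stdio.h>
#include <string.h>

/* Simplified: compare against the expected target, call directly on match. */
#define INDIRECT_CALL_1(f, f1, ...) \
	((f) == (f1) ? f1(__VA_ARGS__) : (f)(__VA_ARGS__))

static size_t simple_copy(const void *src, size_t n, void *dst)
{
	memcpy(dst, src, n);
	return n;
}

static size_t do_copy(size_t (*cb)(const void *, size_t, void *),
		      const void *src, size_t n, void *dst)
{
	/* When cb really is simple_copy, this becomes a direct call. */
	return INDIRECT_CALL_1(cb, simple_copy, src, n, dst);
}

int main(void)
{
	char out[8];
	size_t n = do_copy(simple_copy, "hello", 6, out);

	printf("%zu %s\n", n, out);
	return 0;
}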
- * Caller must also take care of handling if (page_is_)pfmemalloc. + * Caller must also take care of handling if ``(page_is_)pfmemalloc``. * * This function may only be called from softirq context and interrupts * should be enabled. @@ -9299,6 +9299,10 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); + ret = ethtool_check_ops(dev->ethtool_ops); + if (ret) + return ret; + spin_lock_init(&dev->addr_list_lock); lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index dbaebbe573f0..547b587c1950 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -190,6 +190,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; + case __HWTSTAMP_TX_CNT: + /* not a real value */ + break; } switch (rx_filter) { @@ -211,6 +214,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; + case __HWTSTAMP_FILTER_CNT: + /* not a real value */ + break; } if (!tx_type_valid || !rx_filter_valid) diff --git a/net/core/devlink.c b/net/core/devlink.c index f51bebc8c33f..80f97722f31f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -344,7 +344,7 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct devlink_region { struct devlink *devlink; struct list_head list; - const char *name; + const struct devlink_region_ops *ops; struct list_head snapshot_list; u32 max_snapshots; u32 cur_snapshots; @@ -354,7 +354,6 @@ struct devlink_region { struct devlink_snapshot { struct list_head list; struct devlink_region *region; - devlink_snapshot_data_dest_t *data_destructor; u8 *data; u32 id; }; @@ -365,7 +364,7 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name) struct devlink_region *region; list_for_each_entry(region, &devlink->region_list, list) - if (!strcmp(region->name, region_name)) + if (!strcmp(region->ops->name, region_name)) return region; return NULL; @@ -2710,7 +2709,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb, struct net *net; if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) { - NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified"); + NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified"); return ERR_PTR(-EINVAL); } @@ -2728,7 +2727,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb, net = ERR_PTR(-EINVAL); } if (IS_ERR(net)) { - NL_SET_ERR_MSG(info->extack, "Unknown network namespace"); + NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace"); return ERR_PTR(-EINVAL); } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { @@ -3695,7 +3694,7 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name); + err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name); if (err) goto nla_put_failure; @@ -3741,7 +3740,7 @@ static void devlink_nl_region_notify(struct devlink_region *region, goto out_cancel_msg; err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, - region->name); + region->ops->name); if (err) goto out_cancel_msg; @@ -3769,13 +3768,201 @@ out_free_msg: nlmsg_free(msg); } +/** + * __devlink_snapshot_id_increment - Increment number of snapshots using an id + * @devlink: devlink instance + * @id: the snapshot id + * + * Track when a new snapshot begins using an 
id. Load the count for the + * given id from the snapshot xarray, increment it, and store it back. + * + * Called when a new snapshot is created with the given id. + * + * The id *must* have been previously allocated by + * devlink_region_snapshot_id_get(). + * + * Returns 0 on success, or an error on failure. + */ +static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id) +{ + unsigned long count; + void *p; + + lockdep_assert_held(&devlink->lock); + + p = xa_load(&devlink->snapshot_ids, id); + if (WARN_ON(!p)) + return -EINVAL; + + if (WARN_ON(!xa_is_value(p))) + return -EINVAL; + + count = xa_to_value(p); + count++; + + return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_KERNEL)); +} + +/** + * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id + * @devlink: devlink instance + * @id: the snapshot id + * + * Track when a snapshot is deleted and stops using an id. Load the count + * for the given id from the snapshot xarray, decrement it, and store it + * back. + * + * If the count reaches zero, erase this id from the xarray, freeing it + * up for future re-use by devlink_region_snapshot_id_get(). + * + * Called when a snapshot using the given id is deleted, and when the + * initial allocator of the id is finished using it. + */ +static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id) +{ + unsigned long count; + void *p; + + lockdep_assert_held(&devlink->lock); + + p = xa_load(&devlink->snapshot_ids, id); + if (WARN_ON(!p)) + return; + + if (WARN_ON(!xa_is_value(p))) + return; + + count = xa_to_value(p); + + if (count > 1) { + count--; + xa_store(&devlink->snapshot_ids, id, xa_mk_value(count), + GFP_KERNEL); + } else { + /* If this was the last user, we can erase this id */ + xa_erase(&devlink->snapshot_ids, id); + } +} + +/** + * __devlink_snapshot_id_insert - Insert a specific snapshot ID + * @devlink: devlink instance + * @id: the snapshot id + * + * Mark the given snapshot id as used by inserting a zero value into the + * snapshot xarray. + * + * This must be called while holding the devlink instance lock. Unlike + * devlink_snapshot_id_get, the initial reference count is zero, not one. + * It is expected that the id will immediately be used before + * releasing the devlink instance lock. + * + * Returns zero on success, or an error code if the snapshot id could not + * be inserted. + */ +static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id) +{ + lockdep_assert_held(&devlink->lock); + + if (WARN_ON(xa_load(&devlink->snapshot_ids, id))) + return -EEXIST; + + return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0), + GFP_KERNEL)); +} + +/** + * __devlink_region_snapshot_id_get - get snapshot ID + * @devlink: devlink instance + * @id: storage to return snapshot id + * + * Allocates a new snapshot id. Returns zero on success, or a negative + * error on failure. Must be called while holding the devlink instance + * lock. + * + * Snapshot IDs are tracked using an xarray which stores the number of + * users of the snapshot id. + * + * Note that the caller of this function counts as a 'user', in order to + * avoid race conditions. The caller must release its hold on the + * snapshot by using devlink_region_snapshot_id_put. 
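[Editor's sketch] The xarray bookkeeping documented above is a use count keyed by snapshot id: the allocator of the id holds one reference, every snapshot created with it holds another, and the id is recycled once the count drops to zero. A toy model of that lifecycle in plain C, with a fixed-size table standing in for the xarray:

#include <stdio.h>

static unsigned long use_count[32];  /* 0 = id free, else reference count */

static int id_get(unsigned int *id)  /* cf. __devlink_region_snapshot_id_get */
{
	for (unsigned int i = 0; i < 32; i++)
		if (!use_count[i]) {
			use_count[i] = 1;  /* the caller counts as a user */
			*id = i;
			return 0;
		}
	return -1;
}

static void id_increment(unsigned int id) { use_count[id]++; }

static void id_decrement(unsigned int id)
{
	if (!--use_count[id])
		; /* last user gone: id may be handed out again */
}

int main(void)
{
	unsigned int id;

	id_get(&id);      /* allocator:        count 1 */
	id_increment(id); /* snapshot created: count 2 */
	id_decrement(id); /* allocator done:   count 1 */
	id_decrement(id); /* snapshot deleted: count 0, id reusable */
	printf("id %u now free (count %lu)\n", id, use_count[id]);
	return 0;
}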
+ */ +static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) +{ + lockdep_assert_held(&devlink->lock); + + return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1), + xa_limit_32b, GFP_KERNEL); +} + +/** + * __devlink_region_snapshot_create - create a new snapshot + * This will add a new snapshot of a region. The snapshot + * will be stored on the region struct and can be accessed + * from devlink. This is useful for future analyses of snapshots. + * Multiple snapshots can be created on a region. + * The @snapshot_id should be obtained using the getter function. + * + * Must be called only while holding the devlink instance lock. + * + * @region: devlink region of the snapshot + * @data: snapshot data + * @snapshot_id: snapshot id to be created + */ +static int +__devlink_region_snapshot_create(struct devlink_region *region, + u8 *data, u32 snapshot_id) +{ + struct devlink *devlink = region->devlink; + struct devlink_snapshot *snapshot; + int err; + + lockdep_assert_held(&devlink->lock); + + /* check if region can hold one more snapshot */ + if (region->cur_snapshots == region->max_snapshots) + return -ENOSPC; + + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) + return -EEXIST; + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) + return -ENOMEM; + + err = __devlink_snapshot_id_increment(devlink, snapshot_id); + if (err) + goto err_snapshot_id_increment; + + snapshot->id = snapshot_id; + snapshot->region = region; + snapshot->data = data; + + list_add_tail(&snapshot->list, ®ion->snapshot_list); + + region->cur_snapshots++; + + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + return 0; + +err_snapshot_id_increment: + kfree(snapshot); + return err; +} + static void devlink_region_snapshot_del(struct devlink_region *region, struct devlink_snapshot *snapshot) { + struct devlink *devlink = region->devlink; + + lockdep_assert_held(&devlink->lock); + devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL); region->cur_snapshots--; list_del(&snapshot->list); - (*snapshot->data_destructor)(snapshot->data); + region->ops->destructor(snapshot->data); + __devlink_snapshot_id_decrement(devlink, snapshot->id); kfree(snapshot); } @@ -3878,6 +4065,71 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, return 0; } +static int +devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_region *region; + const char *region_name; + u32 snapshot_id; + u8 *data; + int err; + + if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) { + NL_SET_ERR_MSG_MOD(info->extack, "No region name provided"); + return -EINVAL; + } + + if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { + NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); + return -EINVAL; + } + + region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); + region = devlink_region_get_by_name(devlink, region_name); + if (!region) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist"); + return -EINVAL; + } + + if (!region->ops->snapshot) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot"); + return -EOPNOTSUPP; + } + + if (region->cur_snapshots == region->max_snapshots) { + NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots"); + return -ENOSPC; + } + + snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + + if 
(devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } + + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + + err = region->ops->snapshot(devlink, info->extack, &data); + if (err) + goto err_snapshot_capture; + + err = __devlink_region_snapshot_create(region, data, snapshot_id); + if (err) + goto err_snapshot_create; + + return 0; + +err_snapshot_create: + region->ops->destructor(data); +err_snapshot_capture: + __devlink_snapshot_id_decrement(devlink, snapshot_id); + return err; +} + static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, struct devlink *devlink, u8 *chunk, u32 chunk_size, @@ -4837,6 +5089,7 @@ struct devlink_health_reporter { struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; bool auto_recover; + bool auto_dump; u8 health_state; u64 dump_ts; u64 dump_real_ts; @@ -4872,14 +5125,12 @@ devlink_health_reporter_find_by_name(struct devlink *devlink, * @devlink: devlink * @ops: ops * @graceful_period: to avoid recovery loops, in msecs - * @auto_recover: auto recover when error occurs * @priv: priv */ struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) + u64 graceful_period, void *priv) { struct devlink_health_reporter *reporter; @@ -4889,8 +5140,7 @@ devlink_health_reporter_create(struct devlink *devlink, goto unlock; } - if (WARN_ON(auto_recover && !ops->recover) || - WARN_ON(graceful_period && !ops->recover)) { + if (WARN_ON(graceful_period && !ops->recover)) { reporter = ERR_PTR(-EINVAL); goto unlock; } @@ -4905,7 +5155,8 @@ devlink_health_reporter_create(struct devlink *devlink, reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = graceful_period; - reporter->auto_recover = auto_recover; + reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; mutex_init(&reporter->dump_lock); refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); @@ -4986,6 +5237,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, reporter->dump_real_ts, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; + if (reporter->ops->dump && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + reporter->auto_dump)) + goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); @@ -5132,10 +5387,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); + if (reporter->auto_dump) { + mutex_lock(&reporter->dump_lock); + /* store current dump of current error, for later analysis */ + devlink_health_do_dump(reporter, priv_ctx, NULL); + mutex_unlock(&reporter->dump_lock); + } if (reporter->auto_recover) return devlink_health_reporter_recover(reporter, @@ -5309,6 +5566,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, err = -EOPNOTSUPP; goto out; } + if (!reporter->ops->dump && + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { + err = -EOPNOTSUPP; + goto out; + } if 
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -5318,6 +5580,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + reporter->auto_dump = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); + devlink_health_reporter_put(reporter); return 0; out: @@ -5454,18 +5720,35 @@ struct devlink_stats { }; /** + * struct devlink_trap_policer_item - Packet trap policer attributes. + * @policer: Immutable packet trap policer attributes. + * @rate: Rate in packets / sec. + * @burst: Burst size in packets. + * @list: trap_policer_list member. + * + * Describes packet trap policer attributes. Created by devlink during trap + * policer registration. + */ +struct devlink_trap_policer_item { + const struct devlink_trap_policer *policer; + u64 rate; + u64 burst; + struct list_head list; +}; + +/** * struct devlink_trap_group_item - Packet trap group attributes. * @group: Immutable packet trap group attributes. - * @refcount: Number of trap items using the group. + * @policer_item: Associated policer item. Can be NULL. * @list: trap_group_list member. * @stats: Trap group statistics. * * Describes packet trap group attributes. Created by devlink during trap - * registration. + * group registration. */ struct devlink_trap_group_item { const struct devlink_trap_group *group; - refcount_t refcount; + struct devlink_trap_policer_item *policer_item; struct list_head list; struct devlink_stats __percpu *stats; }; @@ -5491,6 +5774,19 @@ struct devlink_trap_item { void *priv; }; +static struct devlink_trap_policer_item * +devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id) +{ + struct devlink_trap_policer_item *policer_item; + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { + if (policer_item->policer->id == id) + return policer_item; + } + + return NULL; +} + static struct devlink_trap_item * devlink_trap_item_lookup(struct devlink *devlink, const char *name) { @@ -5818,6 +6114,19 @@ devlink_trap_group_item_lookup(struct devlink *devlink, const char *name) } static struct devlink_trap_group_item * +devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id) +{ + struct devlink_trap_group_item *group_item; + + list_for_each_entry(group_item, &devlink->trap_group_list, list) { + if (group_item->group->id == id) + return group_item; + } + + return NULL; +} + +static struct devlink_trap_group_item * devlink_trap_group_item_get_from_info(struct devlink *devlink, struct genl_info *info) { @@ -5854,6 +6163,11 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink, nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC)) goto nla_put_failure; + if (group_item->policer_item && + nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, + group_item->policer_item->policer->id)) + goto nla_put_failure; + err = devlink_trap_stats_put(msg, group_item->stats); if (err) goto nla_put_failure; @@ -5955,7 +6269,7 @@ __devlink_trap_group_action_set(struct devlink *devlink, int err; list_for_each_entry(trap_item, &devlink->trap_list, list) { - if (strcmp(trap_item->trap->group.name, group_name)) + if (strcmp(trap_item->group_item->group->name, group_name)) continue; err = __devlink_trap_action_set(devlink, trap_item, trap_action, extack); @@ -5969,7 +6283,7 @@ __devlink_trap_group_action_set(struct devlink *devlink, static int devlink_trap_group_action_set(struct devlink 
*devlink, struct devlink_trap_group_item *group_item, - struct genl_info *info) + struct genl_info *info, bool *p_modified) { enum devlink_trap_action trap_action; int err; @@ -5988,6 +6302,47 @@ devlink_trap_group_action_set(struct devlink *devlink, if (err) return err; + *p_modified = true; + + return 0; +} + +static int devlink_trap_group_set(struct devlink *devlink, + struct devlink_trap_group_item *group_item, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + const struct devlink_trap_policer *policer; + struct nlattr **attrs = info->attrs; + int err; + + if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) + return 0; + + if (!devlink->ops->trap_group_set) + return -EOPNOTSUPP; + + policer_item = group_item->policer_item; + if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) { + u32 policer_id; + + policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + policer_item = devlink_trap_policer_item_lookup(devlink, + policer_id); + if (policer_id && !policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; + } + } + policer = policer_item ? policer_item->policer : NULL; + + err = devlink->ops->trap_group_set(devlink, group_item->group, policer); + if (err) + return err; + + group_item->policer_item = policer_item; + return 0; } @@ -5997,6 +6352,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, struct netlink_ext_ack *extack = info->extack; struct devlink *devlink = info->user_ptr[0]; struct devlink_trap_group_item *group_item; + bool modified = false; int err; if (list_empty(&devlink->trap_group_list)) @@ -6008,14 +6364,262 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb, return -ENOENT; } - err = devlink_trap_group_action_set(devlink, group_item, info); + err = devlink_trap_group_action_set(devlink, group_item, info, + &modified); if (err) return err; + err = devlink_trap_group_set(devlink, group_item, info); + if (err) + goto err_trap_group_set; + return 0; + +err_trap_group_set: + if (modified) + NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already"); + return err; +} + +static struct devlink_trap_policer_item * +devlink_trap_policer_item_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + u32 id; + + if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) + return NULL; + id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]); + + return devlink_trap_policer_item_lookup(devlink, id); +} + +static int +devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct nlattr *attr; + u64 drops; + int err; + + if (!devlink->ops->trap_policer_counter_get) + return 0; + + err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops); + if (err) + return err; + + attr = nla_nest_start(msg, DEVLINK_ATTR_STATS); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + nla_nest_end(msg, attr); + + return 0; + +nla_put_failure: + nla_nest_cancel(msg, attr); + return -EMSGSIZE; +} + +static int +devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd, u32 portid, u32 seq, + int flags) +{ + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + 
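/* Sketch (not part of this patch): the trap-group set path above only
 * rebinds the group item to an already-registered policer item; the
 * hardware programming is delegated to the driver's trap_group_set()
 * callback, whose signature follows the call site above. The "mydrv"
 * names and MYDRV_NO_POLICER are hypothetical.
 */
static int mydrv_trap_group_set(struct devlink *devlink,
				const struct devlink_trap_group *group,
				const struct devlink_trap_policer *policer)
{
	struct mydrv *priv = devlink_priv(devlink);
	/* A NULL policer means the group should be unbound. */
	u32 hw_policer_id = policer ? policer->id : MYDRV_NO_POLICER;

	return mydrv_hw_trap_group_bind(priv, group->id, hw_policer_id);
}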
+ if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID, + policer_item->policer->id)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, + policer_item->rate, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, + policer_item->burst, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + err = devlink_trap_policer_stats_put(msg, devlink, + policer_item->policer); + if (err) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (list_empty(&devlink->trap_policer_list)) + return -EOPNOTSUPP; + + policer_item = devlink_trap_policer_item_get_from_info(devlink, info); + if (!policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) + goto err_trap_policer_fill; + + return genlmsg_reply(msg, info); + +err_trap_policer_fill: + nlmsg_free(msg); + return err; +} + +static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW; + struct devlink_trap_policer_item *policer_item; + u32 portid = NETLINK_CB(cb->skb).portid; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(policer_item, &devlink->trap_policer_list, + list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_trap_policer_fill(msg, devlink, + policer_item, cmd, + portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int +devlink_trap_policer_set(struct devlink *devlink, + struct devlink_trap_policer_item *policer_item, + struct genl_info *info) +{ + struct netlink_ext_ack *extack = info->extack; + struct nlattr **attrs = info->attrs; + u64 rate, burst; + int err; + + rate = policer_item->rate; + burst = policer_item->burst; + + if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]) + rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]); + + if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]) + burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]); + + if (rate < policer_item->policer->min_rate) { + NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit"); + return -EINVAL; + } + + if (rate > policer_item->policer->max_rate) { + NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit"); + return -EINVAL; + } + + if (burst < policer_item->policer->min_burst) { + NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit"); + return -EINVAL; + } + + if (burst > policer_item->policer->max_burst) { + 
NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit"); + return -EINVAL; + } + + err = devlink->ops->trap_policer_set(devlink, policer_item->policer, + rate, burst, info->extack); + if (err) + return err; + + policer_item->rate = rate; + policer_item->burst = burst; + + return 0; +} + +static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_trap_policer_item *policer_item; + struct netlink_ext_ack *extack = info->extack; + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->trap_policer_list)) + return -EOPNOTSUPP; + + if (!devlink->ops->trap_policer_set) + return -EOPNOTSUPP; + + policer_item = devlink_trap_policer_item_get_from_info(devlink, info); + if (!policer_item) { + NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer"); + return -ENOENT; + } + + return devlink_trap_policer_set(devlink, policer_item, info); } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { + [DEVLINK_ATTR_UNSPEC] = { .strict_start_type = + DEVLINK_ATTR_TRAP_POLICER_ID }, [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, @@ -6053,6 +6657,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 }, [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 }, [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, + [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, + [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -6176,7 +6784,8 @@ static const struct genl_ops devlink_nl_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_eswitch_get_doit, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_ESWITCH_SET, @@ -6275,6 +6884,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { + .cmd = DEVLINK_CMD_REGION_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = devlink_nl_cmd_region_new, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { .cmd = DEVLINK_CMD_REGION_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_region_del, @@ -6380,6 +6996,19 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_GET, + .doit = devlink_nl_cmd_trap_policer_get_doit, + .dumpit = devlink_nl_cmd_trap_policer_get_dumpit, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_TRAP_POLICER_SET, + .doit = devlink_nl_cmd_trap_policer_set_doit, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -6417,6 +7046,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (!devlink) return NULL; devlink->ops = ops; + xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); __devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); 
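/* Sketch (not part of this patch) of the matching driver-side
 * callbacks. devlink_trap_policer_set() above has already range-checked
 * rate and burst against the policer's min/max, so the driver only
 * programs the hardware. The "mydrv" names are hypothetical; the
 * callback signatures follow the call sites above.
 */
static int mydrv_trap_policer_set(struct devlink *devlink,
				  const struct devlink_trap_policer *policer,
				  u64 rate, u64 burst,
				  struct netlink_ext_ack *extack)
{
	struct mydrv *priv = devlink_priv(devlink);

	return mydrv_hw_policer_program(priv, policer->id, rate, burst);
}

static int
mydrv_trap_policer_counter_get(struct devlink *devlink,
			       const struct devlink_trap_policer *policer,
			       u64 *p_drops)
{
	struct mydrv *priv = devlink_priv(devlink);

	/* Reported as DEVLINK_ATTR_STATS_RX_DROPPED inside the nested
	 * DEVLINK_ATTR_STATS attribute built by
	 * devlink_trap_policer_stats_put() above.
	 */
	return mydrv_hw_policer_drops_read(priv, policer->id, p_drops);
}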
INIT_LIST_HEAD(&devlink->sb_list); @@ -6427,6 +7057,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); + INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); return devlink; @@ -6511,6 +7142,7 @@ void devlink_free(struct devlink *devlink) { mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); + WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); @@ -6521,6 +7153,8 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->port_list)); + xa_destroy(&devlink->snapshot_ids); + kfree(devlink); } EXPORT_SYMBOL_GPL(devlink_free); @@ -7636,21 +8270,24 @@ EXPORT_SYMBOL_GPL(devlink_param_value_str_fill); * devlink_region_create - create a new address region * * @devlink: devlink - * @region_name: region name + * @ops: region operations and name * @region_max_snapshots: Maximum supported number of snapshots for region * @region_size: size of region */ -struct devlink_region *devlink_region_create(struct devlink *devlink, - const char *region_name, - u32 region_max_snapshots, - u64 region_size) +struct devlink_region * +devlink_region_create(struct devlink *devlink, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) { struct devlink_region *region; int err = 0; + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) + return ERR_PTR(-EINVAL); + mutex_lock(&devlink->lock); - if (devlink_region_get_by_name(devlink, region_name)) { + if (devlink_region_get_by_name(devlink, ops->name)) { err = -EEXIST; goto unlock; } @@ -7663,7 +8300,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink, region->devlink = devlink; region->max_snapshots = region_max_snapshots; - region->name = region_name; + region->ops = ops; region->size = region_size; INIT_LIST_HEAD(®ion->snapshot_list); list_add_tail(®ion->list, &devlink->region_list); @@ -7709,75 +8346,66 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy); * Driver should use the same id for multiple snapshots taken * on multiple regions at the same time/by the same trigger. * + * The caller of this function must use devlink_region_snapshot_id_put + * when finished creating regions using this id. + * + * Returns zero on success, or a negative error code on failure. + * * @devlink: devlink + * @id: storage to return id */ -u32 devlink_region_snapshot_id_get(struct devlink *devlink) +int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id) { - u32 id; + int err; mutex_lock(&devlink->lock); - id = ++devlink->snapshot_id; + err = __devlink_region_snapshot_id_get(devlink, id); mutex_unlock(&devlink->lock); - return id; + return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get); /** + * devlink_region_snapshot_id_put - put snapshot ID reference + * + * This should be called by a driver after finishing creating snapshots + * with an id. Doing so ensures that the ID can later be released in the + * event that all snapshots using it have been destroyed. 
+ * + * @devlink: devlink + * @id: id to release reference on + */ +void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id) +{ + mutex_lock(&devlink->lock); + __devlink_snapshot_id_decrement(devlink, id); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put); + +/** * devlink_region_snapshot_create - create a new snapshot * This will add a new snapshot of a region. The snapshot * will be stored on the region struct and can be accessed - * from devlink. This is useful for future analyses of snapshots. + * from devlink. This is useful for future analyses of snapshots. * Multiple snapshots can be created on a region. * The @snapshot_id should be obtained using the getter function. * * @region: devlink region of the snapshot * @data: snapshot data * @snapshot_id: snapshot id to be created - * @data_destructor: pointer to destructor function to free data */ int devlink_region_snapshot_create(struct devlink_region *region, - u8 *data, u32 snapshot_id, - devlink_snapshot_data_dest_t *data_destructor) + u8 *data, u32 snapshot_id) { struct devlink *devlink = region->devlink; - struct devlink_snapshot *snapshot; int err; mutex_lock(&devlink->lock); - - /* check if region can hold one more snapshot */ - if (region->cur_snapshots == region->max_snapshots) { - err = -ENOMEM; - goto unlock; - } - - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - err = -EEXIST; - goto unlock; - } - - snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); - if (!snapshot) { - err = -ENOMEM; - goto unlock; - } - - snapshot->id = snapshot_id; - snapshot->region = region; - snapshot->data = data; - snapshot->data_destructor = data_destructor; - - list_add_tail(&snapshot->list, ®ion->snapshot_list); - - region->cur_snapshots++; - - devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW); + err = __devlink_region_snapshot_create(region, data, snapshot_id); mutex_unlock(&devlink->lock); - return 0; -unlock: - mutex_unlock(&devlink->lock); return err; } EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); @@ -7866,7 +8494,7 @@ static int devlink_trap_driver_verify(const struct devlink_trap *trap) static int devlink_trap_verify(const struct devlink_trap *trap) { - if (!trap || !trap->name || !trap->group.name) + if (!trap || !trap->name) return -EINVAL; if (trap->generic) @@ -7937,108 +8565,22 @@ devlink_trap_group_notify(struct devlink *devlink, msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static struct devlink_trap_group_item * -devlink_trap_group_item_create(struct devlink *devlink, - const struct devlink_trap_group *group) -{ - struct devlink_trap_group_item *group_item; - int err; - - err = devlink_trap_group_verify(group); - if (err) - return ERR_PTR(err); - - group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); - if (!group_item) - return ERR_PTR(-ENOMEM); - - group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); - if (!group_item->stats) { - err = -ENOMEM; - goto err_stats_alloc; - } - - group_item->group = group; - refcount_set(&group_item->refcount, 1); - - if (devlink->ops->trap_group_init) { - err = devlink->ops->trap_group_init(devlink, group); - if (err) - goto err_group_init; - } - - list_add_tail(&group_item->list, &devlink->trap_group_list); - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_NEW); - - return group_item; - -err_group_init: - free_percpu(group_item->stats); -err_stats_alloc: - kfree(group_item); - return ERR_PTR(err); -} - -static void -devlink_trap_group_item_destroy(struct devlink *devlink, - 
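/* Sketch (not part of this patch) of the reference-counted snapshot-ID
 * flow a driver is expected to follow with the reworked API: take an
 * ID, attach one or more snapshots to it, then drop the driver's own
 * reference. mydrv_capture() is hypothetical; on successful
 * devlink_region_snapshot_create() the data buffer is owned by devlink
 * and is assumed to be freed later through region->ops->destructor.
 */
static int mydrv_take_snapshot(struct devlink *devlink,
			       struct devlink_region *region)
{
	u32 snapshot_id;
	u8 *data;
	int err;

	err = devlink_region_snapshot_id_get(devlink, &snapshot_id);
	if (err)
		return err;

	data = mydrv_capture(region); /* kmalloc()ed dump of the region */
	if (!data) {
		err = -ENOMEM;
		goto out;
	}

	err = devlink_region_snapshot_create(region, data, snapshot_id);
	if (err)
		kfree(data);
out:
	/* The ID stays alive as long as snapshots still reference it. */
	devlink_region_snapshot_id_put(devlink, snapshot_id);
	return err;
}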
struct devlink_trap_group_item *group_item) -{ - devlink_trap_group_notify(devlink, group_item, - DEVLINK_CMD_TRAP_GROUP_DEL); - list_del(&group_item->list); - free_percpu(group_item->stats); - kfree(group_item); -} - -static struct devlink_trap_group_item * -devlink_trap_group_item_get(struct devlink *devlink, - const struct devlink_trap_group *group) -{ - struct devlink_trap_group_item *group_item; - - group_item = devlink_trap_group_item_lookup(devlink, group->name); - if (group_item) { - refcount_inc(&group_item->refcount); - return group_item; - } - - return devlink_trap_group_item_create(devlink, group); -} - -static void -devlink_trap_group_item_put(struct devlink *devlink, - struct devlink_trap_group_item *group_item) -{ - if (!refcount_dec_and_test(&group_item->refcount)) - return; - - devlink_trap_group_item_destroy(devlink, group_item); -} - static int devlink_trap_item_group_link(struct devlink *devlink, struct devlink_trap_item *trap_item) { + u16 group_id = trap_item->trap->init_group_id; struct devlink_trap_group_item *group_item; - group_item = devlink_trap_group_item_get(devlink, - &trap_item->trap->group); - if (IS_ERR(group_item)) - return PTR_ERR(group_item); + group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id); + if (WARN_ON_ONCE(!group_item)) + return -EINVAL; trap_item->group_item = group_item; return 0; } -static void -devlink_trap_item_group_unlink(struct devlink *devlink, - struct devlink_trap_item *trap_item) -{ - devlink_trap_group_item_put(devlink, trap_item->group_item); -} - static void devlink_trap_notify(struct devlink *devlink, const struct devlink_trap_item *trap_item, enum devlink_command cmd) @@ -8101,7 +8643,6 @@ devlink_trap_register(struct devlink *devlink, return 0; err_trap_init: - devlink_trap_item_group_unlink(devlink, trap_item); err_group_link: free_percpu(trap_item->stats); err_stats_alloc: @@ -8122,7 +8663,6 @@ static void devlink_trap_unregister(struct devlink *devlink, list_del(&trap_item->list); if (devlink->ops->trap_fini) devlink->ops->trap_fini(devlink, trap, trap_item); - devlink_trap_item_group_unlink(devlink, trap_item); free_percpu(trap_item->stats); kfree(trap_item); } @@ -8278,6 +8818,288 @@ void *devlink_trap_ctx_priv(void *trap_ctx) } EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv); +static int +devlink_trap_group_item_policer_link(struct devlink *devlink, + struct devlink_trap_group_item *group_item) +{ + u32 policer_id = group_item->group->init_policer_id; + struct devlink_trap_policer_item *policer_item; + + if (policer_id == 0) + return 0; + + policer_item = devlink_trap_policer_item_lookup(devlink, policer_id); + if (WARN_ON_ONCE(!policer_item)) + return -EINVAL; + + group_item->policer_item = policer_item; + + return 0; +} + +static int +devlink_trap_group_register(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + int err; + + if (devlink_trap_group_item_lookup(devlink, group->name)) + return -EEXIST; + + group_item = kzalloc(sizeof(*group_item), GFP_KERNEL); + if (!group_item) + return -ENOMEM; + + group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats); + if (!group_item->stats) { + err = -ENOMEM; + goto err_stats_alloc; + } + + group_item->group = group; + + err = devlink_trap_group_item_policer_link(devlink, group_item); + if (err) + goto err_policer_link; + + if (devlink->ops->trap_group_init) { + err = devlink->ops->trap_group_init(devlink, group); + if (err) + goto err_group_init; + } + + list_add_tail(&group_item->list, 
&devlink->trap_group_list); + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + return 0; + +err_group_init: +err_policer_link: + free_percpu(group_item->stats); +err_stats_alloc: + kfree(group_item); + return err; +} + +static void +devlink_trap_group_unregister(struct devlink *devlink, + const struct devlink_trap_group *group) +{ + struct devlink_trap_group_item *group_item; + + group_item = devlink_trap_group_item_lookup(devlink, group->name); + if (WARN_ON_ONCE(!group_item)) + return; + + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_del(&group_item->list); + free_percpu(group_item->stats); + kfree(group_item); +} + +/** + * devlink_trap_groups_register - Register packet trap groups with devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. + * + * Return: Non-zero value on failure. + */ +int devlink_trap_groups_register(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int i, err; + + mutex_lock(&devlink->lock); + for (i = 0; i < groups_count; i++) { + const struct devlink_trap_group *group = &groups[i]; + + err = devlink_trap_group_verify(group); + if (err) + goto err_trap_group_verify; + + err = devlink_trap_group_register(devlink, group); + if (err) + goto err_trap_group_register; + } + mutex_unlock(&devlink->lock); + + return 0; + +err_trap_group_register: +err_trap_group_verify: + for (i--; i >= 0; i--) + devlink_trap_group_unregister(devlink, &groups[i]); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_trap_groups_register); + +/** + * devlink_trap_groups_unregister - Unregister packet trap groups from devlink. + * @devlink: devlink. + * @groups: Packet trap groups. + * @groups_count: Count of provided packet trap groups. 
+ */ +void devlink_trap_groups_unregister(struct devlink *devlink, + const struct devlink_trap_group *groups, + size_t groups_count) +{ + int i; + + mutex_lock(&devlink->lock); + for (i = groups_count - 1; i >= 0; i--) + devlink_trap_group_unregister(devlink, &groups[i]); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister); + +static void +devlink_trap_policer_notify(struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd) +{ + struct sk_buff *msg; + int err; + + WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && + cmd != DEVLINK_CMD_TRAP_POLICER_DEL); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0, + 0, 0); + if (err) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); +} + +static int +devlink_trap_policer_register(struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct devlink_trap_policer_item *policer_item; + int err; + + if (devlink_trap_policer_item_lookup(devlink, policer->id)) + return -EEXIST; + + policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL); + if (!policer_item) + return -ENOMEM; + + policer_item->policer = policer; + policer_item->rate = policer->init_rate; + policer_item->burst = policer->init_burst; + + if (devlink->ops->trap_policer_init) { + err = devlink->ops->trap_policer_init(devlink, policer); + if (err) + goto err_policer_init; + } + + list_add_tail(&policer_item->list, &devlink->trap_policer_list); + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); + + return 0; + +err_policer_init: + kfree(policer_item); + return err; +} + +static void +devlink_trap_policer_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policer) +{ + struct devlink_trap_policer_item *policer_item; + + policer_item = devlink_trap_policer_item_lookup(devlink, policer->id); + if (WARN_ON_ONCE(!policer_item)) + return; + + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); + list_del(&policer_item->list); + if (devlink->ops->trap_policer_fini) + devlink->ops->trap_policer_fini(devlink, policer); + kfree(policer_item); +} + +/** + * devlink_trap_policers_register - Register packet trap policers with devlink. + * @devlink: devlink. + * @policers: Packet trap policers. + * @policers_count: Count of provided packet trap policers. + * + * Return: Non-zero value on failure. 
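/* Sketch (not part of this patch) of driver init-time ordering. Groups
 * resolve their init_policer_id, and traps their init_group_id, by
 * lookup at registration time (see the WARN_ON_ONCE() lookups above),
 * so policers must be registered before groups, and groups before
 * traps. The mydrv_* arrays are hypothetical;
 * devlink_traps_register() is the pre-existing trap registration API.
 */
static int mydrv_traps_init(struct devlink *devlink)
{
	int err;

	err = devlink_trap_policers_register(devlink, mydrv_trap_policers,
					     ARRAY_SIZE(mydrv_trap_policers));
	if (err)
		return err;

	err = devlink_trap_groups_register(devlink, mydrv_trap_groups,
					   ARRAY_SIZE(mydrv_trap_groups));
	if (err)
		goto err_groups_register;

	err = devlink_traps_register(devlink, mydrv_traps,
				     ARRAY_SIZE(mydrv_traps),
				     devlink_priv(devlink));
	if (err)
		goto err_traps_register;

	return 0;

err_traps_register:
	devlink_trap_groups_unregister(devlink, mydrv_trap_groups,
				       ARRAY_SIZE(mydrv_trap_groups));
err_groups_register:
	devlink_trap_policers_unregister(devlink, mydrv_trap_policers,
					 ARRAY_SIZE(mydrv_trap_policers));
	return err;
}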
+ */ +int +devlink_trap_policers_register(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) +{ + int i, err; + + mutex_lock(&devlink->lock); + for (i = 0; i < policers_count; i++) { + const struct devlink_trap_policer *policer = &policers[i]; + + if (WARN_ON(policer->id == 0 || + policer->max_rate < policer->min_rate || + policer->max_burst < policer->min_burst)) { + err = -EINVAL; + goto err_trap_policer_verify; + } + + err = devlink_trap_policer_register(devlink, policer); + if (err) + goto err_trap_policer_register; + } + mutex_unlock(&devlink->lock); + + return 0; + +err_trap_policer_register: +err_trap_policer_verify: + for (i--; i >= 0; i--) + devlink_trap_policer_unregister(devlink, &policers[i]); + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_trap_policers_register); + +/** + * devlink_trap_policers_unregister - Unregister packet trap policers from devlink. + * @devlink: devlink. + * @policers: Packet trap policers. + * @policers_count: Count of provided packet trap policers. + */ +void +devlink_trap_policers_unregister(struct devlink *devlink, + const struct devlink_trap_policer *policers, + size_t policers_count) +{ + int i; + + mutex_lock(&devlink->lock); + for (i = policers_count - 1; i >= 0; i--) + devlink_trap_policer_unregister(devlink, &policers[i]); + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister); + static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 7440e6117c81..e951b743bed3 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -511,7 +511,8 @@ EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); void flow_indr_block_call(struct net_device *dev, struct flow_block_offload *bo, - enum flow_block_command command) + enum flow_block_command command, + enum tc_setup_type type) { struct flow_indr_block_cb *indr_block_cb; struct flow_indr_block_dev *indr_dev; @@ -521,8 +522,7 @@ void flow_indr_block_call(struct net_device *dev, return; list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, - bo); + indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); } EXPORT_SYMBOL_GPL(flow_indr_block_call); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 99a6de52b21d..7d3438215f32 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -367,7 +367,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; -static int bpf_build_state(struct nlattr *nla, +static int bpf_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 2f9c0de533c7..8ec7d13d2860 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "BPF"; case LWTUNNEL_ENCAP_SEG6_LOCAL: return "SEG6LOCAL"; + case LWTUNNEL_ENCAP_RPL: + return "RPL"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: @@ -98,7 +100,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, } EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops); -int lwtunnel_build_state(u16 encap_type, +int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void 
*cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) @@ -122,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type, rcu_read_unlock(); if (found) { - ret = ops->build_state(encap, family, cfg, lws, extack); + ret = ops->build_state(net, encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } else { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 626db912fce4..ef98372facf6 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -43,9 +43,11 @@ static int page_pool_init(struct page_pool *pool, * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ - if ((pool->p.dma_dir != DMA_FROM_DEVICE) && - (pool->p.dma_dir != DMA_BIDIRECTIONAL)) - return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_MAP) { + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + } if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f2b3d8dd40f4..08e2811b5274 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ - skb_reset_tc(skb); + skb_reset_redirect(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e1101a4f90a6..621b4479fee1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3926,14 +3926,21 @@ normal: goto perform_csum_check; if (!sg) { - if (!nskb->remcsum_offload) - nskb->ip_summed = CHECKSUM_NONE; - SKB_GSO_CB(nskb)->csum = - skb_copy_and_csum_bits(head_skb, offset, - skb_put(nskb, len), - len, 0); - SKB_GSO_CB(nskb)->csum_start = - skb_headroom(nskb) + doffset; + if (!csum) { + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = + skb_copy_and_csum_bits(head_skb, offset, + skb_put(nskb, + len), + len, 0); + SKB_GSO_CB(nskb)->csum_start = + skb_headroom(nskb) + doffset; + } else { + skb_copy_bits(head_skb, offset, + skb_put(nskb, len), + len); + } continue; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a7075b3b4489..b08dfae10f88 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -299,8 +299,11 @@ static void sock_map_free(struct bpf_map *map) struct bpf_stab *stab = container_of(map, struct bpf_stab, map); int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. + */ synchronize_rcu(); - raw_spin_lock_bh(&stab->lock); for (i = 0; i < stab->map.max_entries; i++) { struct sock **psk = &stab->sks[i]; struct sock *sk; @@ -314,7 +317,6 @@ static void sock_map_free(struct bpf_map *map) release_sock(sk); } } - raw_spin_unlock_bh(&stab->lock); /* wait for psock readers accessing its map link */ synchronize_rcu(); @@ -1008,10 +1010,13 @@ static void sock_hash_free(struct bpf_map *map) struct hlist_node *node; int i; + /* After the sync no updates or deletes will be in-flight so it + * is safe to walk map and remove entries without risking a race + * in EEXIST update case. 
+ */ synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - raw_spin_lock_bh(&bucket->lock); hlist_for_each_entry_safe(elem, node, &bucket->head, node) { hlist_del_rcu(&elem->node); lock_sock(elem->sk); @@ -1020,7 +1025,6 @@ static void sock_hash_free(struct bpf_map *map) rcu_read_unlock(); release_sock(elem->sk); } - raw_spin_unlock_bh(&bucket->lock); } /* wait for psock readers accessing its map link */ diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 08c3dc45f1a4..06b9983325cc 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1173,7 +1173,7 @@ make_route: if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST); + rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, 0); if (rt == NULL) goto e_nobufs; @@ -1439,7 +1439,7 @@ static int dn_route_input_slow(struct sk_buff *skb) } make_route: - rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST); + rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, 0); if (rt == NULL) goto e_nobufs; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 17281fec710c..ee2610c4d46a 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -88,13 +88,9 @@ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; const struct dsa_device_ops *ops; - char module_name[128]; bool found = false; - snprintf(module_name, 127, "%s%d", DSA_TAG_DRIVER_ALIAS, - tag_protocol); - - request_module(module_name); + request_module("%s%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index e7c30b472034..9a271a58a41d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -18,8 +18,8 @@ #include "dsa_priv.h" -static LIST_HEAD(dsa_tree_list); static DEFINE_MUTEX(dsa2_mutex); +LIST_HEAD(dsa_tree_list); static const struct devlink_ops dsa_devlink_ops = { }; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 760e6ea3178a..904cc7c9b882 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -22,6 +22,7 @@ enum { DSA_NOTIFIER_MDB_DEL, DSA_NOTIFIER_VLAN_ADD, DSA_NOTIFIER_VLAN_DEL, + DSA_NOTIFIER_MTU, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -61,6 +62,14 @@ struct dsa_notifier_vlan_info { int port; }; +/* DSA_NOTIFIER_MTU */ +struct dsa_notifier_mtu_info { + bool propagate_upstream; + int sw_index; + int port; + int mtu; +}; + struct dsa_slave_priv { /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, @@ -127,6 +136,8 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans); +int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, + bool propagate_upstream); int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid); int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, @@ -183,4 +194,8 @@ dsa_slave_to_master(const struct net_device *dev) /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); + +/* dsa2.c */ +extern struct list_head dsa_tree_list; + #endif diff --git a/net/dsa/master.c b/net/dsa/master.c index bd44bde272f4..b5c535af63a3 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -314,20 +314,6 @@ 
static const struct attribute_group dsa_group = { .attrs = dsa_slave_attrs, }; -static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp) -{ - unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead; - int err; - - rtnl_lock(); - if (mtu <= dev->max_mtu) { - err = dev_set_mtu(dev, mtu); - if (err) - netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n"); - } - rtnl_unlock(); -} - static void dsa_master_reset_mtu(struct net_device *dev) { int err; @@ -344,7 +330,12 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; - dsa_master_set_mtu(dev, cpu_dp); + rtnl_lock(); + ret = dev_set_mtu(dev, ETH_DATA_LEN + cpu_dp->tag_ops->overhead); + rtnl_unlock(); + if (ret) + netdev_warn(dev, "error %d setting MTU to include DSA overhead\n", + ret); /* If we use a tagging format that doesn't have an ethertype * field, make sure that all packets from this point on get diff --git a/net/dsa/port.c b/net/dsa/port.c index a18e65a474a5..231b2d494f1c 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -297,6 +297,19 @@ int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, return ds->ops->port_egress_floods(ds, port, true, mrouter); } +int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, + bool propagate_upstream) +{ + struct dsa_notifier_mtu_info info = { + .sw_index = dp->ds->index, + .propagate_upstream = propagate_upstream, + .port = dp->index, + .mtu = new_mtu, + }; + + return dsa_port_notify(dp, DSA_NOTIFIER_MTU, &info); +} + int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c5beb3031a72..9692a726f2ed 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -842,63 +842,137 @@ dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) return NULL; } -static int dsa_slave_add_cls_matchall(struct net_device *dev, - struct tc_cls_matchall_offload *cls, - bool ingress) +static int +dsa_slave_add_cls_matchall_mirred(struct net_device *dev, + struct tc_cls_matchall_offload *cls, + bool ingress) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; - __be16 protocol = cls->common.protocol; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; - int err = -EOPNOTSUPP; + int err; + + act = &cls->rule->action.entries[0]; if (!ds->ops->port_mirror_add) - return err; + return -EOPNOTSUPP; - if (!flow_offload_has_one_action(&cls->rule->action)) - return err; + if (!act->dev) + return -EINVAL; - if (!flow_action_basic_hw_stats_types_check(&cls->rule->action, - cls->common.extack)) - return err; + if (!flow_action_basic_hw_stats_check(&cls->rule->action, + cls->common.extack)) + return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; - if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { - struct dsa_mall_mirror_tc_entry *mirror; + if (!dsa_slave_dev_check(act->dev)) + return -EOPNOTSUPP; + + mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); + if (!mall_tc_entry) + return -ENOMEM; - if (!act->dev) - return -EINVAL; + mall_tc_entry->cookie = cls->cookie; + mall_tc_entry->type = DSA_PORT_MALL_MIRROR; + mirror = &mall_tc_entry->mirror; - if (!dsa_slave_dev_check(act->dev)) - return -EOPNOTSUPP; + to_dp = dsa_slave_to_port(act->dev); + + mirror->to_local_port = to_dp->index; + mirror->ingress = ingress; - mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); - if 
(!mall_tc_entry) - return -ENOMEM; + err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress); + if (err) { + kfree(mall_tc_entry); + return err; + } - mall_tc_entry->cookie = cls->cookie; - mall_tc_entry->type = DSA_PORT_MALL_MIRROR; - mirror = &mall_tc_entry->mirror; + list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); + + return err; +} - to_dp = dsa_slave_to_port(act->dev); +static int +dsa_slave_add_cls_matchall_police(struct net_device *dev, + struct tc_cls_matchall_offload *cls, + bool ingress) +{ + struct netlink_ext_ack *extack = cls->common.extack; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_mall_policer_tc_entry *policer; + struct dsa_mall_tc_entry *mall_tc_entry; + struct dsa_switch *ds = dp->ds; + struct flow_action_entry *act; + int err; - mirror->to_local_port = to_dp->index; - mirror->ingress = ingress; + if (!ds->ops->port_policer_add) { + NL_SET_ERR_MSG_MOD(extack, + "Policing offload not implemented"); + return -EOPNOTSUPP; + } - err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress); - if (err) { - kfree(mall_tc_entry); - return err; + if (!ingress) { + NL_SET_ERR_MSG_MOD(extack, + "Only supported on ingress qdisc"); + return -EOPNOTSUPP; + } + + if (!flow_action_basic_hw_stats_check(&cls->rule->action, + cls->common.extack)) + return -EOPNOTSUPP; + + list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { + if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { + NL_SET_ERR_MSG_MOD(extack, + "Only one port policer allowed"); + return -EEXIST; } + } + + act = &cls->rule->action.entries[0]; + + mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); + if (!mall_tc_entry) + return -ENOMEM; + + mall_tc_entry->cookie = cls->cookie; + mall_tc_entry->type = DSA_PORT_MALL_POLICER; + policer = &mall_tc_entry->policer; + policer->rate_bytes_per_sec = act->police.rate_bytes_ps; + policer->burst = act->police.burst; - list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); + err = ds->ops->port_policer_add(ds, dp->index, policer); + if (err) { + kfree(mall_tc_entry); + return err; } - return 0; + list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); + + return err; +} + +static int dsa_slave_add_cls_matchall(struct net_device *dev, + struct tc_cls_matchall_offload *cls, + bool ingress) +{ + int err = -EOPNOTSUPP; + + if (cls->common.protocol == htons(ETH_P_ALL) && + flow_offload_has_one_action(&cls->rule->action) && + cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED) + err = dsa_slave_add_cls_matchall_mirred(dev, cls, ingress); + else if (flow_offload_has_one_action(&cls->rule->action) && + cls->rule->action.entries[0].id == FLOW_ACTION_POLICE) + err = dsa_slave_add_cls_matchall_police(dev, cls, ingress); + + return err; } static void dsa_slave_del_cls_matchall(struct net_device *dev, @@ -908,9 +982,6 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; - if (!ds->ops->port_mirror_del) - return; - mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; @@ -919,7 +990,13 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: - ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); + if (ds->ops->port_mirror_del) + ds->ops->port_mirror_del(ds, dp->index, + &mall_tc_entry->mirror); + break; + case DSA_PORT_MALL_POLICER: + if (ds->ops->port_policer_del) + ds->ops->port_policer_del(ds,
dp->index); break; default: WARN_ON(1); @@ -1218,6 +1295,208 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, return dsa_port_vid_del(dp, vid); } +struct dsa_hw_port { + struct list_head list; + struct net_device *dev; + int old_mtu; +}; + +static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) +{ + const struct dsa_hw_port *p; + int err; + + list_for_each_entry(p, hw_port_list, list) { + if (p->dev->mtu == mtu) + continue; + + err = dev_set_mtu(p->dev, mtu); + if (err) + goto rollback; + } + + return 0; + +rollback: + list_for_each_entry_continue_reverse(p, hw_port_list, list) { + if (p->dev->mtu == p->old_mtu) + continue; + + if (dev_set_mtu(p->dev, p->old_mtu)) + netdev_err(p->dev, "Failed to restore MTU\n"); + } + + return err; +} + +static void dsa_hw_port_list_free(struct list_head *hw_port_list) +{ + struct dsa_hw_port *p, *n; + + list_for_each_entry_safe(p, n, hw_port_list, list) + kfree(p); +} + +/* Make the hardware datapath to/from @dev limited to a common MTU */ +void dsa_bridge_mtu_normalization(struct dsa_port *dp) +{ + struct list_head hw_port_list; + struct dsa_switch_tree *dst; + int min_mtu = ETH_MAX_MTU; + struct dsa_port *other_dp; + int err; + + if (!dp->ds->mtu_enforcement_ingress) + return; + + if (!dp->bridge_dev) + return; + + INIT_LIST_HEAD(&hw_port_list); + + /* Populate the list of ports that are part of the same bridge + * as the newly added/modified port + */ + list_for_each_entry(dst, &dsa_tree_list, list) { + list_for_each_entry(other_dp, &dst->ports, list) { + struct dsa_hw_port *hw_port; + struct net_device *slave; + + if (other_dp->type != DSA_PORT_TYPE_USER) + continue; + + if (other_dp->bridge_dev != dp->bridge_dev) + continue; + + if (!other_dp->ds->mtu_enforcement_ingress) + continue; + + slave = other_dp->slave; + + if (min_mtu > slave->mtu) + min_mtu = slave->mtu; + + hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); + if (!hw_port) + goto out; + + hw_port->dev = slave; + hw_port->old_mtu = slave->mtu; + + list_add(&hw_port->list, &hw_port_list); + } + } + + /* Attempt to configure the entire hardware bridge to the newly added + * interface's MTU first, regardless of whether the intention of the + * user was to raise or lower it. + */ + err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->slave->mtu); + if (!err) + goto out; + + /* Clearly that didn't work out so well, so just set the minimum MTU on + * all hardware bridge ports now. If this fails too, then all ports will + * still have their old MTU rolled back anyway. + */ + dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); + +out: + dsa_hw_port_list_free(&hw_port_list); +} + +static int dsa_slave_change_mtu(struct net_device *dev, int new_mtu) +{ + struct net_device *master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + struct dsa_port *cpu_dp; + int port = p->dp->index; + int largest_mtu = 0; + int new_master_mtu; + int old_master_mtu; + int mtu_limit; + int cpu_mtu; + int err, i; + + if (!ds->ops->port_change_mtu) + return -EOPNOTSUPP; + + for (i = 0; i < ds->num_ports; i++) { + int slave_mtu; + + if (!dsa_is_user_port(ds, i)) + continue; + + /* During probe, this function will be called for each slave + * device, while not all of them have been allocated. That's + * ok, it doesn't change what the maximum is, so ignore it. 
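/* Sketch (not part of this patch): dsa_slave_change_mtu() delegates
 * the per-port programming to two dsa_switch_ops hooks whose
 * signatures follow the call sites in this patch. The frame-length
 * budget and the mydrv_* names are hypothetical.
 */
static int mydrv_port_change_mtu(struct dsa_switch *ds, int port,
				 int new_mtu)
{
	struct mydrv *priv = ds->priv;

	/* On CPU/DSA ports the core already added the tagger overhead,
	 * so only the L2 framing is accounted for here.
	 */
	return mydrv_hw_set_max_frame_len(priv, port,
					  new_mtu + ETH_HLEN + ETH_FCS_LEN);
}

static int mydrv_port_max_mtu(struct dsa_switch *ds, int port)
{
	/* Advertised through slave_dev->max_mtu in dsa_slave_create(). */
	return MYDRV_MAX_FRAME_LEN - ETH_HLEN - ETH_FCS_LEN;
}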
+ */ + if (!dsa_to_port(ds, i)->slave) + continue; + + /* Pretend that we already applied the setting, which we + * actually haven't (still haven't done all integrity checks) + */ + if (i == port) + slave_mtu = new_mtu; + else + slave_mtu = dsa_to_port(ds, i)->slave->mtu; + + if (largest_mtu < slave_mtu) + largest_mtu = slave_mtu; + } + + cpu_dp = dsa_to_port(ds, port)->cpu_dp; + + mtu_limit = min_t(int, master->max_mtu, dev->max_mtu); + old_master_mtu = master->mtu; + new_master_mtu = largest_mtu + cpu_dp->tag_ops->overhead; + if (new_master_mtu > mtu_limit) + return -ERANGE; + + /* If the master MTU isn't over limit, there's no need to check the CPU + * MTU, since that surely isn't either. + */ + cpu_mtu = largest_mtu; + + /* Start applying stuff */ + if (new_master_mtu != old_master_mtu) { + err = dev_set_mtu(master, new_master_mtu); + if (err < 0) + goto out_master_failed; + + /* We only need to propagate the MTU of the CPU port to + * upstream switches. + */ + err = dsa_port_mtu_change(cpu_dp, cpu_mtu, true); + if (err) + goto out_cpu_failed; + } + + err = dsa_port_mtu_change(dp, new_mtu, false); + if (err) + goto out_port_failed; + + dev->mtu = new_mtu; + + dsa_bridge_mtu_normalization(dp); + + return 0; + +out_port_failed: + if (new_master_mtu != old_master_mtu) + dsa_port_mtu_change(cpu_dp, old_master_mtu - + cpu_dp->tag_ops->overhead, + true); +out_cpu_failed: + if (new_master_mtu != old_master_mtu) + dev_set_mtu(master, old_master_mtu); +out_master_failed: + return err; +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, @@ -1295,6 +1574,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, .ndo_get_devlink_port = dsa_slave_get_devlink_port, + .ndo_change_mtu = dsa_slave_change_mtu, }; static struct device_type dsa_type = { @@ -1465,7 +1745,10 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; slave_dev->min_mtu = 0; - slave_dev->max_mtu = ETH_MAX_MTU; + if (ds->ops->port_max_mtu) + slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); + else + slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); SET_NETDEV_DEV(slave_dev, port->ds->dev); @@ -1483,6 +1766,15 @@ int dsa_slave_create(struct dsa_port *port) p->xmit = cpu_dp->tag_ops->xmit; port->slave = slave_dev; + rtnl_lock(); + ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN); + rtnl_unlock(); + if (ret && ret != -EOPNOTSUPP) { + dev_err(ds->dev, "error %d setting MTU on port %d\n", + ret, port->index); + goto out_free; + } + netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(slave_dev); @@ -1545,6 +1837,8 @@ static int dsa_slave_changeupper(struct net_device *dev, if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev); + if (!err) + dsa_bridge_mtu_normalization(dp); err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index df4abe897ed6..f3c32ff552b3 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -52,6 +52,40 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, return 0; } +static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, + struct dsa_notifier_mtu_info *info) +{ + if (ds->index == info->sw_index) + return (port == info->port) || 
dsa_is_dsa_port(ds, port); + + if (!info->propagate_upstream) + return false; + + if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + return true; + + return false; +} + +static int dsa_switch_mtu(struct dsa_switch *ds, + struct dsa_notifier_mtu_info *info) +{ + int port, ret; + + if (!ds->ops->port_change_mtu) + return -EOPNOTSUPP; + + for (port = 0; port < ds->num_ports; port++) { + if (dsa_switch_mtu_match(ds, port, info)) { + ret = ds->ops->port_change_mtu(ds, port, info->mtu); + if (ret) + return ret; + } + } + + return 0; +} + static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { @@ -328,6 +362,9 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_VLAN_DEL: err = dsa_switch_vlan_del(ds, info); break; + case DSA_NOTIFIER_MTU: + err = dsa_switch_mtu(ds, info); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 2fb6c26294b5..b97ad93d1c1a 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -298,47 +298,4 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -/* In the DSA packet_type handler, skb->data points in the middle of the VLAN - * tag, after tpid and before tci. This is because so far, ETH_HLEN - * (DMAC, SMAC, EtherType) bytes were pulled. - * There are 2 bytes of VLAN tag left in skb->data, and upper - * layers expect the 'real' EtherType to be consumed as well. - * Coincidentally, a VLAN header is also of the same size as - * the number of bytes that need to be pulled. - * - * skb_mac_header skb->data - * | | - * v v - * | | | | | | | | | | | | | | | | | | | - * +-----------------------+-----------------------+-------+-------+-------+ - * | Destination MAC | Source MAC | TPID | TCI | EType | - * +-----------------------+-----------------------+-------+-------+-------+ - * ^ | | - * |<--VLAN_HLEN-->to <---VLAN_HLEN---> - * from | - * >>>>>>> v - * >>>>>>> | | | | | | | | | | | | | | | - * >>>>>>> +-----------------------+-----------------------+-------+ - * >>>>>>> | Destination MAC | Source MAC | EType | - * +-----------------------+-----------------------+-------+ - * ^ ^ - * (now part of | | - * skb->head) skb_mac_header skb->data - */ -struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb) -{ - u8 *from = skb_mac_header(skb); - u8 *dest = from + VLAN_HLEN; - - memmove(dest, from, ETH_HLEN - VLAN_HLEN); - skb_pull(skb, VLAN_HLEN); - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - skb_pull_rcsum(skb, ETH_HLEN); - - return skb; -} -EXPORT_SYMBOL_GPL(dsa_8021q_remove_header); - MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9c3114179690..cc8512b5f9e2 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -140,8 +140,31 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); + skb->offload_fwd_mark = 1; + return skb; } + +static int brcm_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto, + int *offset) +{ + /* We have been called on the DSA master network device after + * eth_type_trans() which pulled the Ethernet header already. 
+ * Frames have one of these two layouts: + * ----------------------------------- + * | MAC DA | MAC SA | 4b tag | Type | DSA_TAG_PROTO_BRCM + * ----------------------------------- + * ----------------------------------- + * | 4b tag | MAC DA | MAC SA | Type | DSA_TAG_PROTO_BRCM_PREPEND + * ----------------------------------- + * skb->data points 2 bytes before the actual Ethernet type field and + * we have an offset of 4bytes between where skb->data and where the + * payload starts. + */ + *offset = BRCM_TAG_LEN; + *proto = ((__be16 *)skb->data)[1]; + return 0; +} #endif #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) @@ -177,6 +200,7 @@ static const struct dsa_device_ops brcm_netdev_ops = { .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, .overhead = BRCM_TAG_LEN, + .flow_dissect = brcm_tag_flow_dissect, }; DSA_TAG_DRIVER(brcm_netdev_ops); @@ -205,6 +229,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = { .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, .overhead = BRCM_TAG_LEN, + .flow_dissect = brcm_tag_flow_dissect, }; DSA_TAG_DRIVER(brcm_prepend_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 5366ea430349..d553bf36bd41 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -250,14 +250,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, { struct sja1105_meta meta = {0}; int source_port, switch_id; - struct vlan_ethhdr *hdr; + struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; bool is_tagged; bool is_meta; - hdr = vlan_eth_hdr(skb); - tpid = ntohs(hdr->h_vlan_proto); + hdr = eth_hdr(skb); + tpid = ntohs(hdr->h_proto); is_tagged = (tpid == ETH_P_SJA1105); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); @@ -266,7 +266,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (is_tagged) { /* Normal traffic path. */ - tci = ntohs(hdr->h_vlan_TCI); + skb_push_rcsum(skb, ETH_HLEN); + __skb_vlan_pop(skb, &tci); + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + vid = tci & VLAN_VID_MASK; source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); @@ -295,12 +300,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - /* Delete/overwrite fake VLAN header, DSA expects to not find - * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN). 
- */ - if (is_tagged) - skb = dsa_8021q_remove_header(skb); - return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index b0bd3decad02..6c360c9c9370 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o + channels.o coalesce.o pause.o eee.o tsinfo.o diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c new file mode 100644 index 000000000000..6afd99042d67 --- /dev/null +++ b/net/ethtool/coalesce.c @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct coalesce_req_info { + struct ethnl_req_info base; +}; + +struct coalesce_reply_data { + struct ethnl_reply_data base; + struct ethtool_coalesce coalesce; + u32 supported_params; +}; + +#define COALESCE_REPDATA(__reply_base) \ + container_of(__reply_base, struct coalesce_reply_data, base) + +#define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS +static u32 attr_to_mask(unsigned int attr_type) +{ + return BIT(attr_type - __SUPPORTED_OFFSET); +} + +/* build time check that indices in ethtool_ops::supported_coalesce_params + * match corresponding attribute types with an offset + */ +#define __CHECK_SUPPORTED_OFFSET(x) \ + static_assert((ETHTOOL_ ## x) == \ + BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET)) +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ); +__CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS); +__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX); +__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX); +__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW); +__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH); +__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH); +__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH); +__CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL); + +static const struct nla_policy +coalesce_get_policy[ETHTOOL_A_COALESCE_MAX + 1] = { + [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = 
NLA_REJECT }, + [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_REJECT }, +}; + +static int coalesce_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + data->supported_params = dev->ethtool_ops->supported_coalesce_params; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce); + ethnl_ops_complete(dev); + + return ret; +} + +static int coalesce_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u32)) + /* _RX_USECS */ + nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */ + nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */ + nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */ + nla_total_size(sizeof(u32)) + /* _TX_USECS */ + nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */ + nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */ + nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */ + nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */ + nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */ + nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */ + nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */ + nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */ + nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */ + nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */ + nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */ + nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */ + nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */ + nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ + nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ + nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ + nla_total_size(sizeof(u32)); /* _RATE_SAMPLE_INTERVAL */ +} + +static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, + u32 supported_params) +{ + if (!val && !(supported_params & attr_to_mask(attr_type))) + return false; + return nla_put_u32(skb, attr_type, val); +} + +static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val, + u32 supported_params) +{ + if (!val && !(supported_params & attr_to_mask(attr_type))) + return false; + return nla_put_u8(skb, attr_type, !!val); +} + +static int coalesce_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + const struct ethtool_coalesce *coal = &data->coalesce; + u32 supported = data->supported_params; + + if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS, + coal->rx_coalesce_usecs, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES, + 
coal->rx_max_coalesced_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ, + coal->rx_coalesce_usecs_irq, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, + coal->rx_max_coalesced_frames_irq, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS, + coal->tx_coalesce_usecs, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES, + coal->tx_max_coalesced_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ, + coal->tx_coalesce_usecs_irq, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, + coal->tx_max_coalesced_frames_irq, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, + coal->stats_block_coalesce_usecs, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, + coal->use_adaptive_rx_coalesce, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, + coal->use_adaptive_tx_coalesce, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW, + coal->pkt_rate_low, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW, + coal->rx_coalesce_usecs_low, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, + coal->rx_max_coalesced_frames_low, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW, + coal->tx_coalesce_usecs_low, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, + coal->tx_max_coalesced_frames_low, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH, + coal->pkt_rate_high, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH, + coal->rx_coalesce_usecs_high, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, + coal->rx_max_coalesced_frames_high, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH, + coal->tx_coalesce_usecs_high, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, + coal->tx_max_coalesced_frames_high, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, + coal->rate_sample_interval, supported)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_coalesce_request_ops = { + .request_cmd = ETHTOOL_MSG_COALESCE_GET, + .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, + .hdr_attr = ETHTOOL_A_COALESCE_HEADER, + .max_attr = ETHTOOL_A_COALESCE_MAX, + .req_info_size = sizeof(struct coalesce_req_info), + .reply_data_size = sizeof(struct coalesce_reply_data), + .request_policy = coalesce_get_policy, + + .prepare_data = coalesce_prepare_data, + .reply_size = coalesce_reply_size, + .fill_reply = coalesce_fill_reply, +}; + +/* COALESCE_SET */ + +static const struct nla_policy +coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1] = { + [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 }, + 
[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 }, + [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, +}; + +int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_COALESCE_MAX + 1]; + struct ethtool_coalesce coalesce = {}; + struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + u32 supported_params; + bool mod = false; + int ret; + u16 a; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_COALESCE_MAX, coalesce_set_policy, + info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_COALESCE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_coalesce || !ops->set_coalesce) + goto out_dev; + + /* make sure that only supported parameters are present */ + supported_params = ops->supported_coalesce_params; + for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) + if (tb[a] && !(supported_params & attr_to_mask(a))) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, tb[a], + "cannot modify an unsupported parameter"); + goto out_dev; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ops->get_coalesce(dev, &coalesce); + if (ret < 0) + goto out_ops; + + ethnl_update_u32(&coalesce.rx_coalesce_usecs, + tb[ETHTOOL_A_COALESCE_RX_USECS], &mod); + ethnl_update_u32(&coalesce.rx_max_coalesced_frames, + tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod); + ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq, + tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod); + ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq, + tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod); + ethnl_update_u32(&coalesce.tx_coalesce_usecs, + tb[ETHTOOL_A_COALESCE_TX_USECS], &mod); + ethnl_update_u32(&coalesce.tx_max_coalesced_frames, + tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod); + ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq, + tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod); + ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq, + tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod); + ethnl_update_u32(&coalesce.stats_block_coalesce_usecs, + tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod); + ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce, + tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod); + ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce, + tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod); + ethnl_update_u32(&coalesce.pkt_rate_low, + tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod); + ethnl_update_u32(&coalesce.rx_coalesce_usecs_low, + tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod); + ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low, + tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod); + ethnl_update_u32(&coalesce.tx_coalesce_usecs_low, + tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod); + 
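/* Each ethnl_update_u32() / ethnl_update_bool32() call in this block + * copies the value from its netlink attribute into the corresponding + * ethtool_coalesce field only when that attribute is present in the + * request, and sets 'mod' so that a request which changes nothing can + * skip the ->set_coalesce() call below. + */ +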
ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low, + tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod); + ethnl_update_u32(&coalesce.pkt_rate_high, + tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod); + ethnl_update_u32(&coalesce.rx_coalesce_usecs_high, + tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod); + ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high, + tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod); + ethnl_update_u32(&coalesce.tx_coalesce_usecs_high, + tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod); + ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high, + tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); + ethnl_update_u32(&coalesce.rate_sample_interval, + tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); + ret = 0; + if (!mod) + goto out_ops; + + ret = dev->ethtool_ops->set_coalesce(dev, &coalesce); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 0b22741b2f8f..423e640e3876 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/net_tstamp.h> +#include <linux/phy.h> + #include "common.h" const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { @@ -60,6 +63,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list", + [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload", }; const char @@ -203,6 +207,53 @@ const char wol_mode_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT); +const char sof_timestamping_names[][ETH_GSTRING_LEN] = { + [const_ilog2(SOF_TIMESTAMPING_TX_HARDWARE)] = "hardware-transmit", + [const_ilog2(SOF_TIMESTAMPING_TX_SOFTWARE)] = "software-transmit", + [const_ilog2(SOF_TIMESTAMPING_RX_HARDWARE)] = "hardware-receive", + [const_ilog2(SOF_TIMESTAMPING_RX_SOFTWARE)] = "software-receive", + [const_ilog2(SOF_TIMESTAMPING_SOFTWARE)] = "software-system-clock", + [const_ilog2(SOF_TIMESTAMPING_SYS_HARDWARE)] = "hardware-legacy-clock", + [const_ilog2(SOF_TIMESTAMPING_RAW_HARDWARE)] = "hardware-raw-clock", + [const_ilog2(SOF_TIMESTAMPING_OPT_ID)] = "option-id", + [const_ilog2(SOF_TIMESTAMPING_TX_SCHED)] = "sched-transmit", + [const_ilog2(SOF_TIMESTAMPING_TX_ACK)] = "ack-transmit", + [const_ilog2(SOF_TIMESTAMPING_OPT_CMSG)] = "option-cmsg", + [const_ilog2(SOF_TIMESTAMPING_OPT_TSONLY)] = "option-tsonly", + [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats", + [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", + [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", +}; +static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); + +const char ts_tx_type_names[][ETH_GSTRING_LEN] = { + [HWTSTAMP_TX_OFF] = "off", + [HWTSTAMP_TX_ON] = "on", + [HWTSTAMP_TX_ONESTEP_SYNC] = "onestep-sync", + [HWTSTAMP_TX_ONESTEP_P2P] = "onestep-p2p", +}; +static_assert(ARRAY_SIZE(ts_tx_type_names) == __HWTSTAMP_TX_CNT); + +const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { + [HWTSTAMP_FILTER_NONE] = "none", + [HWTSTAMP_FILTER_ALL] = "all", + [HWTSTAMP_FILTER_SOME] = "some", + [HWTSTAMP_FILTER_PTP_V1_L4_EVENT] = "ptpv1-l4-event", + [HWTSTAMP_FILTER_PTP_V1_L4_SYNC] = "ptpv1-l4-sync", + [HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ] = "ptpv1-l4-delay-req", + 
[HWTSTAMP_FILTER_PTP_V2_L4_EVENT] = "ptpv2-l4-event", + [HWTSTAMP_FILTER_PTP_V2_L4_SYNC] = "ptpv2-l4-sync", + [HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ] = "ptpv2-l4-delay-req", + [HWTSTAMP_FILTER_PTP_V2_L2_EVENT] = "ptpv2-l2-event", + [HWTSTAMP_FILTER_PTP_V2_L2_SYNC] = "ptpv2-l2-sync", + [HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ] = "ptpv2-l2-delay-req", + [HWTSTAMP_FILTER_PTP_V2_EVENT] = "ptpv2-event", + [HWTSTAMP_FILTER_PTP_V2_SYNC] = "ptpv2-sync", + [HWTSTAMP_FILTER_PTP_V2_DELAY_REQ] = "ptpv2-delay-req", + [HWTSTAMP_FILTER_NTP_ALL] = "ntp-all", +}; +static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); + /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ @@ -289,3 +340,34 @@ out: kfree(indir); return ret; } + +int ethtool_check_ops(const struct ethtool_ops *ops) +{ + if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params)) + return -EINVAL; + /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime, + * the fact that ops are checked at registration time does not + * mean the ops attached to a netdev later on are sane. + */ + return 0; +} + +int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + memset(info, 0, sizeof(*info)); + info->cmd = ETHTOOL_GET_TS_INFO; + + if (phy_has_tsinfo(phydev)) + return phy_ts_info(phydev, info); + if (ops->get_ts_info) + return ops->get_ts_info(dev, info); + + info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + info->phc_index = -1; + + return 0; +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 03946e16e623..a62f68ccc43a 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -12,6 +12,8 @@ #define ETHTOOL_LINK_MODE(speed, type, duplex) \ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT +#define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) + extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; extern const char @@ -23,6 +25,9 @@ phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; extern const char wol_mode_names[][ETH_GSTRING_LEN]; +extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; +extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; +extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); @@ -30,5 +35,6 @@ bool convert_legacy_settings_to_link_ksettings( struct ethtool_link_ksettings *link_ksettings, const struct ethtool_cmd *legacy_settings); int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); +int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info); #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c index 87f288ee20c8..1bd026a29f3f 100644 --- a/net/ethtool/debug.c +++ b/net/ethtool/debug.c @@ -109,8 +109,9 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -131,6 +132,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c 
new file mode 100644 index 000000000000..94aa19cff22f --- /dev/null +++ b/net/ethtool/eee.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +#define EEE_MODES_COUNT \ + (sizeof_field(struct ethtool_eee, supported) * BITS_PER_BYTE) + +struct eee_req_info { + struct ethnl_req_info base; +}; + +struct eee_reply_data { + struct ethnl_reply_data base; + struct ethtool_eee eee; +}; + +#define EEE_REPDATA(__reply_base) \ + container_of(__reply_base, struct eee_reply_data, base) + +static const struct nla_policy +eee_get_policy[ETHTOOL_A_EEE_MAX + 1] = { + [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_REJECT }, +}; + +static int eee_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct eee_reply_data *data = EEE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_eee) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = dev->ethtool_ops->get_eee(dev, &data->eee); + ethnl_ops_complete(dev); + + return ret; +} + +static int eee_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct eee_reply_data *data = EEE_REPDATA(reply_base); + const struct ethtool_eee *eee = &data->eee; + int len = 0; + int ret; + + BUILD_BUG_ON(sizeof(eee->advertised) * BITS_PER_BYTE != + EEE_MODES_COUNT); + BUILD_BUG_ON(sizeof(eee->lp_advertised) * BITS_PER_BYTE != + EEE_MODES_COUNT); + + /* MODES_OURS */ + ret = ethnl_bitset32_size(&eee->advertised, &eee->supported, + EEE_MODES_COUNT, link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + /* MODES_PEERS */ + ret = ethnl_bitset32_size(&eee->lp_advertised, NULL, + EEE_MODES_COUNT, link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + + len += nla_total_size(sizeof(u8)) + /* _EEE_ACTIVE */ + nla_total_size(sizeof(u8)) + /* _EEE_ENABLED */ + nla_total_size(sizeof(u8)) + /* _EEE_TX_LPI_ENABLED */ + nla_total_size(sizeof(u32)); /* _EEE_TX_LPI_TIMER */ + + return len; +} + +static int eee_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct eee_reply_data *data = EEE_REPDATA(reply_base); + const struct ethtool_eee *eee = &data->eee; + int ret; + + ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS, + &eee->advertised, &eee->supported, + EEE_MODES_COUNT, link_mode_names, compact); + if (ret < 0) + return ret; + ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER, + &eee->lp_advertised, NULL, EEE_MODES_COUNT, + link_mode_names, compact); + if (ret < 0) + return ret; + + if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, !!eee->eee_active) || + nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, !!eee->eee_enabled) || + nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED, + !!eee->tx_lpi_enabled) || + nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer)) + return -EMSGSIZE; + + return 0; 
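+ /* eee_reply_size() reserves space for each attribute emitted above, + * so -EMSGSIZE from these nla_put*() calls is not expected in + * practice. + */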
+} + +const struct ethnl_request_ops ethnl_eee_request_ops = { + .request_cmd = ETHTOOL_MSG_EEE_GET, + .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY, + .hdr_attr = ETHTOOL_A_EEE_HEADER, + .max_attr = ETHTOOL_A_EEE_MAX, + .req_info_size = sizeof(struct eee_req_info), + .reply_data_size = sizeof(struct eee_reply_data), + .request_policy = eee_get_policy, + + .prepare_data = eee_prepare_data, + .reply_size = eee_reply_size, + .fill_reply = eee_fill_reply, +}; + +/* EEE_SET */ + +static const struct nla_policy +eee_set_policy[ETHTOOL_A_EEE_MAX + 1] = { + [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_NESTED }, + [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT }, + [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_U8 }, + [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_U8 }, + [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 }, +}; + +int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_EEE_MAX + 1]; + struct ethtool_eee eee = {}; + struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_EEE_MAX, + eee_set_policy, info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_EEE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_eee || !ops->set_eee) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ops->get_eee(dev, &eee); + if (ret < 0) + goto out_ops; + + ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT, + tb[ETHTOOL_A_EEE_MODES_OURS], + link_mode_names, info->extack, &mod); + if (ret < 0) + goto out_ops; + ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); + ethnl_update_bool32(&eee.tx_lpi_enabled, + tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod); + ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER], + &mod); + ret = 0; + if (!mod) + goto out_ops; + + ret = dev->ethtool_ops->set_eee(dev, &eee); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 258840b19fb5..89d0b1827aaf 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -196,7 +196,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: - return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC_BIT | + return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: @@ -1354,6 +1354,7 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_eee edata; + int ret; if (!dev->ethtool_ops->set_eee) return -EOPNOTSUPP; @@ -1361,7 +1362,10 @@ static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; - return dev->ethtool_ops->set_eee(dev, &edata); + ret = dev->ethtool_ops->set_eee(dev, &edata); + if (!ret) + ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); + return ret; } static int
ethtool_nway_reset(struct net_device *dev) @@ -1519,9 +1523,6 @@ ethtool_set_coalesce_supported(struct net_device *dev, u32 supported_params = dev->ethtool_ops->supported_coalesce_params; u32 nonzero_params = 0; - if (!supported_params) - return true; - if (coalesce->rx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_RX_USECS; if (coalesce->rx_max_coalesced_frames) @@ -1574,6 +1575,7 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce; + int ret; if (!dev->ethtool_ops->set_coalesce) return -EOPNOTSUPP; @@ -1584,7 +1586,10 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, if (!ethtool_set_coalesce_supported(dev, &coalesce)) return -EOPNOTSUPP; - return dev->ethtool_ops->set_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->set_coalesce(dev, &coalesce); + if (!ret) + ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); + return ret; } static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) @@ -1704,6 +1709,7 @@ static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam; + int ret; if (!dev->ethtool_ops->set_pauseparam) return -EOPNOTSUPP; @@ -1711,7 +1717,10 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) return -EFAULT; - return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); + ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam); + if (!ret) + ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); + return ret; } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) @@ -2131,32 +2140,17 @@ out: static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) { - int err = 0; struct ethtool_ts_info info; - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GET_TS_INFO; - - if (phy_has_tsinfo(phydev)) { - err = phy_ts_info(phydev, &info); - } else if (ops->get_ts_info) { - err = ops->get_ts_info(dev, &info); - } else { - info.so_timestamping = - SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info.phc_index = -1; - } + int err; + err = __ethtool_get_ts_info(dev, &info); if (err) return err; if (copy_to_user(useraddr, &info, sizeof(info))) - err = -EFAULT; + return -EFAULT; - return err; + return 0; } static int __ethtool_get_module_info(struct net_device *dev, diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 2df420068cbb..677068deb68c 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -128,9 +128,10 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_link_ksettings || !dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -164,6 +165,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index cb29cc8c5960..452608c6d856 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -341,9 +341,10 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if 
(!dev->ethtool_ops->get_link_ksettings || !dev->ethtool_ops->set_link_ksettings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -373,6 +374,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 55c8ce4019d9..0c772318c023 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -40,6 +40,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1]; const struct nlattr *devname_attr; struct net_device *dev = NULL; + u32 flags = 0; int ret; if (!header) { @@ -50,8 +51,17 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, ethnl_header_policy, extack); if (ret < 0) return ret; - devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; + if (tb[ETHTOOL_A_HEADER_FLAGS]) { + flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); + if (flags & ~ETHTOOL_FLAG_ALL) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_FLAGS], + "unrecognized request flags"); + nl_set_extack_cookie_u32(extack, ETHTOOL_FLAG_ALL); + return -EOPNOTSUPP; + } + } + devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); @@ -90,9 +100,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, } req_info->dev = dev; - if (tb[ETHTOOL_A_HEADER_FLAGS]) - req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); - + req_info->flags = flags; return 0; } @@ -219,6 +227,10 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PRIVFLAGS_GET] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = ðnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = ðnl_channels_request_ops, + [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, + [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, + [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, + [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -535,6 +547,9 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_PRIVFLAGS_NTF] = ðnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_NTF] = ðnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_NTF] = ðnl_channels_request_ops, + [ETHTOOL_MSG_COALESCE_NTF] = ðnl_coalesce_request_ops, + [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, + [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, }; /* default notification handler */ @@ -624,6 +639,9 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -778,6 +796,49 @@ static const struct genl_ops ethtool_genl_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_channels, }, + { + .cmd = ETHTOOL_MSG_COALESCE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_COALESCE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_coalesce, + }, + { + .cmd = ETHTOOL_MSG_PAUSE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = 
ETHTOOL_MSG_PAUSE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_pause, + }, + { + .cmd = ETHTOOL_MSG_EEE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, + { + .cmd = ETHTOOL_MSG_EEE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_eee, + }, + { + .cmd = ETHTOOL_MSG_TSINFO_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 45aad99a6021..81b8fa020bcb 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -341,6 +341,10 @@ extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; extern const struct ethnl_request_ops ethnl_channels_request_ops; +extern const struct ethnl_request_ops ethnl_coalesce_request_ops; +extern const struct ethnl_request_ops ethnl_pause_request_ops; +extern const struct ethnl_request_ops ethnl_eee_request_ops; +extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -350,5 +354,8 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c new file mode 100644 index 000000000000..7aea35d1e8a5 --- /dev/null +++ b/net/ethtool/pause.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct pause_req_info { + struct ethnl_req_info base; +}; + +struct pause_reply_data { + struct ethnl_reply_data base; + struct ethtool_pauseparam pauseparam; +}; + +#define PAUSE_REPDATA(__reply_base) \ + container_of(__reply_base, struct pause_reply_data, base) + +static const struct nla_policy +pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = { + [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT }, +}; + +static int pause_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct pause_reply_data *data = PAUSE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); + ethnl_ops_complete(dev); + + return 0; +} + +static int pause_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ + nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ + nla_total_size(sizeof(u8)); 
/* _PAUSE_TX */ +} + +static int pause_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); + const struct ethtool_pauseparam *pauseparam = &data->pauseparam; + + if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || + nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || + nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_pause_request_ops = { + .request_cmd = ETHTOOL_MSG_PAUSE_GET, + .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PAUSE_HEADER, + .max_attr = ETHTOOL_A_PAUSE_MAX, + .req_info_size = sizeof(struct pause_req_info), + .reply_data_size = sizeof(struct pause_reply_data), + .request_policy = pause_get_policy, + + .prepare_data = pause_prepare_data, + .reply_size = pause_reply_size, + .fill_reply = pause_fill_reply, +}; + +/* PAUSE_SET */ + +static const struct nla_policy +pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = { + [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, + [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, + [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, +}; + +int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_PAUSE_MAX + 1]; + struct ethtool_pauseparam params = {}; + struct ethnl_req_info req_info = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_PAUSE_MAX, + pause_set_policy, info->extack); + if (ret < 0) + return ret; + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_PAUSE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_pauseparam || !ops->set_pauseparam) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ops->get_pauseparam(dev, ¶ms); + + ethnl_update_bool32(¶ms.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); + ethnl_update_bool32(¶ms.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); + ethnl_update_bool32(¶ms.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); + ret = 0; + if (!mod) + goto out_ops; + + ret = dev->ethtool_ops->set_pauseparam(dev, ¶ms); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c index e8f03b33db9b..77447dceb109 100644 --- a/net/ethtool/privflags.c +++ b/net/ethtool/privflags.c @@ -175,9 +175,10 @@ int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info) return ret; dev = req_info.dev; ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; if (!ops->get_priv_flags || !ops->set_priv_flags || !ops->get_sset_count || !ops->get_strings) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -204,6 +205,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 8e5911887b4c..95eae5c68a52 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -60,6 +60,21 @@ static const struct strset_info info_template[] = { .count = WOL_MODE_COUNT, .strings = 
wol_mode_names, }, + [ETH_SS_SOF_TIMESTAMPING] = { + .per_dev = false, + .count = __SOF_TIMESTAMPING_CNT, + .strings = sof_timestamping_names, + }, + [ETH_SS_TS_TX_TYPES] = { + .per_dev = false, + .count = __HWTSTAMP_TX_CNT, + .strings = ts_tx_type_names, + }, + [ETH_SS_TS_RX_FILTERS] = { + .per_dev = false, + .count = __HWTSTAMP_FILTER_CNT, + .strings = ts_rx_filter_names, + }, }; struct strset_req_info { diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c new file mode 100644 index 000000000000..7cb5b512b77c --- /dev/null +++ b/net/ethtool/tsinfo.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/net_tstamp.h> + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct tsinfo_req_info { + struct ethnl_req_info base; +}; + +struct tsinfo_reply_data { + struct ethnl_reply_data base; + struct ethtool_ts_info ts_info; +}; + +#define TSINFO_REPDATA(__reply_base) \ + container_of(__reply_base, struct tsinfo_reply_data, base) + +static const struct nla_policy +tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = { + [ETHTOOL_A_TSINFO_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_TSINFO_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_TSINFO_TIMESTAMPING] = { .type = NLA_REJECT }, + [ETHTOOL_A_TSINFO_TX_TYPES] = { .type = NLA_REJECT }, + [ETHTOOL_A_TSINFO_RX_FILTERS] = { .type = NLA_REJECT }, + [ETHTOOL_A_TSINFO_PHC_INDEX] = { .type = NLA_REJECT }, +}; + +static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = __ethtool_get_ts_info(dev, &data->ts_info); + ethnl_ops_complete(dev); + + return ret; +} + +static int tsinfo_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct ethtool_ts_info *ts_info = &data->ts_info; + int len = 0; + int ret; + + BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + + if (ts_info->so_timestamping) { + ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL, + __SOF_TIMESTAMPING_CNT, + sof_timestamping_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSINFO_TIMESTAMPING */ + } + if (ts_info->tx_types) { + ret = ethnl_bitset32_size(&ts_info->tx_types, NULL, + __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSINFO_TX_TYPES */ + } + if (ts_info->rx_filters) { + ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL, + __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSINFO_RX_FILTERS */ + } + if (ts_info->phc_index >= 0) + len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ + + return len; +} + +static int tsinfo_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct ethtool_ts_info *ts_info = &data->ts_info; + int ret; + + if (ts_info->so_timestamping) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING, + &ts_info->so_timestamping, NULL, + 
__SOF_TIMESTAMPING_CNT, + sof_timestamping_names, compact); + if (ret < 0) + return ret; + } + if (ts_info->tx_types) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES, + &ts_info->tx_types, NULL, + __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + } + if (ts_info->rx_filters) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS, + &ts_info->rx_filters, NULL, + __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + } + if (ts_info->phc_index >= 0 && + nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_tsinfo_request_ops = { + .request_cmd = ETHTOOL_MSG_TSINFO_GET, + .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, + .hdr_attr = ETHTOOL_A_TSINFO_HEADER, + .max_attr = ETHTOOL_A_TSINFO_MAX, + .req_info_size = sizeof(struct tsinfo_req_info), + .reply_data_size = sizeof(struct tsinfo_reply_data), + .request_policy = tsinfo_get_policy, + + .prepare_data = tsinfo_prepare_data, + .reply_size = tsinfo_reply_size, + .fill_reply = tsinfo_fill_reply, +}; diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c index 1d2bcabee554..1798421e9f1c 100644 --- a/net/ethtool/wol.c +++ b/net/ethtool/wol.c @@ -129,8 +129,9 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; dev = req_info.dev; + ret = -EOPNOTSUPP; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) - return -EOPNOTSUPP; + goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); @@ -173,6 +174,7 @@ out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); +out_dev: dev_put(dev); return ret; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index d46d22c7105c..03b891904314 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -483,12 +483,9 @@ int hsr_get_node_data(struct hsr_priv *hsr, struct hsr_port *port; unsigned long tdiff; - rcu_read_lock(); node = find_node_by_addr_A(&hsr->node_db, addr); - if (!node) { - rcu_read_unlock(); - return -ENOENT; /* No such entry */ - } + if (!node) + return -ENOENT; ether_addr_copy(addr_b, node->macaddress_B); @@ -523,7 +520,5 @@ int hsr_get_node_data(struct hsr_priv *hsr, *addr_b_ifindex = -1; } - rcu_read_unlock(); - return 0; } diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 64d39c1e93a2..5465a395da04 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -250,15 +250,16 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; + goto rcu_unlock; /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -312,12 +313,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A); if (port) res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; @@ -327,20 +326,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) res = 
nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq); if (res < 0) goto nla_put_failure; - rcu_read_lock(); port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); if (port) res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX, port->dev->ifindex); - rcu_read_unlock(); if (res < 0) goto nla_put_failure; + rcu_read_unlock(); + genlmsg_end(skb_out, msg_head); genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid); return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; @@ -350,6 +351,7 @@ nla_put_failure: /* Fall through */ fail: + rcu_read_unlock(); return res; } @@ -357,16 +359,14 @@ fail: */ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) { - /* For receiving */ - struct nlattr *na; + unsigned char addr[ETH_ALEN]; struct net_device *hsr_dev; - - /* For sending */ struct sk_buff *skb_out; - void *msg_head; struct hsr_priv *hsr; - void *pos; - unsigned char addr[ETH_ALEN]; + bool restart = false; + struct nlattr *na; + void *pos = NULL; + void *msg_head; int res; if (!info) @@ -376,15 +376,17 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) if (!na) goto invalid; - hsr_dev = __dev_get_by_index(genl_info_net(info), - nla_get_u32(info->attrs[HSR_A_IFINDEX])); + rcu_read_lock(); + hsr_dev = dev_get_by_index_rcu(genl_info_net(info), + nla_get_u32(info->attrs[HSR_A_IFINDEX])); if (!hsr_dev) - goto invalid; + goto rcu_unlock; if (!is_hsr_master(hsr_dev)) - goto invalid; + goto rcu_unlock; +restart: /* Send reply */ - skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb_out) { res = -ENOMEM; goto fail; @@ -398,18 +400,26 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) goto nla_put_failure; } - res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); - if (res < 0) - goto nla_put_failure; + if (!restart) { + res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex); + if (res < 0) + goto nla_put_failure; + } hsr = netdev_priv(hsr_dev); - rcu_read_lock(); - pos = hsr_get_next_node(hsr, NULL, addr); + if (!pos) + pos = hsr_get_next_node(hsr, NULL, addr); while (pos) { res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr); if (res < 0) { - rcu_read_unlock(); + if (res == -EMSGSIZE) { + genlmsg_end(skb_out, msg_head); + genlmsg_unicast(genl_info_net(info), skb_out, + info->snd_portid); + restart = true; + goto restart; + } goto nla_put_failure; } pos = hsr_get_next_node(hsr, pos, addr); @@ -421,15 +431,18 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; +rcu_unlock: + rcu_read_unlock(); invalid: netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: - kfree_skb(skb_out); + nlmsg_free(skb_out); /* Fall through */ fail: + rcu_read_unlock(); return res; } @@ -456,6 +469,7 @@ static struct genl_family hsr_genl_family __ro_after_init = { .version = 1, .maxattr = HSR_A_MAX, .policy = hsr_genl_policy, + .netnsok = true, .module = THIS_MODULE, .ops = hsr_ops, .n_ops = ARRAY_SIZE(hsr_ops), diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index d3547e8c6d5b..f4b9f7a3ce51 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -151,16 +151,16 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, if (!port) return -ENOMEM; + port->hsr = hsr; + port->dev = dev; + port->type = type; + if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } - port->hsr = 
hsr; - port->dev = dev; - port->type = type; - list_add_tail_rcu(&port->port_list, &hsr->ports); synchronize_rcu(); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f96bd489b362..6490b845e17b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -303,6 +303,7 @@ config SYN_COOKIES config NET_IPVTI tristate "Virtual (secure) IP: tunneling" + depends on IPV6 || IPV6=n select INET_TUNNEL select NET_IP_TUNNEL select XFRM diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index bd7b4e92e07f..cf58e29cf746 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); +#ifdef CONFIG_MPTCP + /* allocated on demand, see mptcp_init_sock() */ + free_percpu(net->mib.mptcp_statistics); +#endif } static __net_initdata struct pernet_operations ipv4_mib_ops = { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 0fd8bfde2448..e3939f76b024 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -217,7 +217,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, { const struct tcp_congestion_ops *utcp_ca; struct tcp_congestion_ops *tcp_ca; - size_t tcp_ca_name_len; int prog_fd; u32 moff; @@ -232,13 +231,11 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, tcp_ca->flags = utcp_ca->flags; return 1; case offsetof(struct tcp_congestion_ops, name): - tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); - if (!tcp_ca_name_len || - tcp_ca_name_len == sizeof(utcp_ca->name)) + if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, + sizeof(tcp_ca->name)) <= 0) return -EINVAL; if (tcp_ca_find(utcp_ca->name)) return -EEXIST; - memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); return 1; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 103c7d599a3c..8b07f3a4f2db 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -341,22 +341,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } -static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) -{ - /* Fill padding... 
*/ - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = proto; -} - static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, int encap_type, struct esp_info *esp, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e2e219c7854a..731022cff600 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -132,6 +132,36 @@ static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, return segs; } +static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + struct sk_buff *segs = ERR_PTR(-EINVAL); + const struct net_offload *ops; + int proto = xo->proto; + + skb->transport_header += x->props.header_len; + + if (proto == IPPROTO_BEETPH) { + struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; + + skb->transport_header += ph->hdrlen * 8; + proto = ph->nexthdr; + } else if (x->sel.family != AF_INET6) { + skb->transport_header -= IPV4_BEET_PHMAXLEN; + } else if (proto == IPPROTO_TCP) { + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + } + + __skb_pull(skb, skb_transport_offset(skb)); + ops = rcu_dereference(inet_offloads[proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) @@ -141,6 +171,8 @@ static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm4_transport_gso_segment(x, skb, features); + case XFRM_MODE_BEET: + return xfrm4_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 577db1d50a24..213be9c050ad 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -997,7 +997,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } + rcu_read_lock(); err = fib_table_dump(tb, skb, cb, &filter); + rcu_read_unlock(); return skb->len ? 
: err; } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index c092e9a55790..818916b2a04d 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -35,7 +35,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa) void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, +int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e4c62b8f57a8..6ed8c9317179 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -570,8 +570,9 @@ static int fib_detect_death(struct fib_info *fi, int order, return 1; } -int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap, - u16 encap_type, void *cfg, gfp_t gfp_flags, +int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, + struct nlattr *encap, u16 encap_type, + void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { int err; @@ -589,8 +590,9 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap, err = -EINVAL; goto lwt_failure; } - err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family, - cfg, &lwtstate, extack); + err = lwtunnel_build_state(net, encap_type, encap, + nhc->nhc_family, cfg, &lwtstate, + extack); if (err) goto lwt_failure; @@ -614,7 +616,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, nh->fib_nh_family = AF_INET; - err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap, + err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, GFP_KERNEL, extack); if (err) return err; @@ -814,7 +816,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int fib_encap_match(u16 encap_type, +static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, const struct fib_config *cfg, @@ -826,7 +828,7 @@ static int fib_encap_match(u16 encap_type, if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; - ret = lwtunnel_build_state(encap_type, encap, AF_INET, + ret = lwtunnel_build_state(net, encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); @@ -836,7 +838,7 @@ static int fib_encap_match(u16 encap_type, return result; } -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, +int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -857,8 +859,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { - if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, - nh, cfg, extack)) + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index f4c2ac445b3f..b01b748df9dd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1679,7 +1679,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi, extack) == 0 && + fib_nh_match(net, cfg, fi, extack) 
== 0 && fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8274f98c511c..029b24eeafba 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1153,6 +1153,24 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + return 0; +} + +static int erspan_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms, + __u32 *fwmark) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = ipgre_netlink_parms(dev, data, tb, parms, fwmark); + if (err) + return err; + if (!data) + return 0; + if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); @@ -1276,45 +1294,70 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int +ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) { - struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; - __u32 fwmark = 0; - int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { struct ip_tunnel *t = netdev_priv(dev); - err = ip_tunnel_encap_setup(t, &ipencap); + int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } + return 0; +} + +static int ipgre_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel_parm p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; return ip_tunnel_newlink(dev, tb, &p, fwmark); } +static int erspan_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel_parm p; + __u32 fwmark = 0; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err) + return err; + return ip_tunnel_newlink(dev, tb, &p, fwmark); +} + static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_encap ipencap; __u32 fwmark = t->fwmark; struct ip_tunnel_parm p; int err; - if (ipgre_netlink_encap_parms(data, &ipencap)) { - err = ip_tunnel_encap_setup(t, &ipencap); - - if (err < 0) - return err; - } + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) @@ -1327,8 +1370,34 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], t->parms.i_flags = p.i_flags; t->parms.o_flags = p.o_flags; - if (strcmp(dev->rtnl_link_ops->kind, "erspan")) - ipgre_link_update(dev, !tb[IFLA_MTU]); + ipgre_link_update(dev, !tb[IFLA_MTU]); + + return 0; +} + +static int erspan_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct ip_tunnel *t = netdev_priv(dev); + __u32 fwmark = t->fwmark; + struct ip_tunnel_parm p; + int err; + + err = ipgre_newlink_encap_setup(dev, data); + if (err) + return err; + + err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); + if (err < 0) + return err; + + err = 
ip_tunnel_changelink(dev, tb, &p, fwmark); + if (err < 0) + return err; + + t->parms.i_flags = p.i_flags; + t->parms.o_flags = p.o_flags; return 0; } @@ -1519,8 +1588,8 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { .priv_size = sizeof(struct ip_tunnel), .setup = erspan_setup, .validate = erspan_validate, - .newlink = ipgre_newlink, - .changelink = ipgre_changelink, + .newlink = erspan_newlink, + .changelink = erspan_changelink, .dellink = ip_tunnel_dellink, .get_size = ipgre_get_size, .fill_info = ipgre_fill_info, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index aaaaf907e0d8..090d3097ee15 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * insufficient MTU. */ features = netif_skb_features(skb); - BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOBUFS; } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 47f8b947eef1..181b7a2a0247 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -432,7 +432,7 @@ static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info, return ip_tun_parse_opts(attr, info, extack); } -static int ip_tun_build_state(struct nlattr *attr, +static int ip_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) @@ -719,7 +719,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED }, }; -static int ip6_tun_build_state(struct nlattr *attr, +static int ip6_tun_build_state(struct net *net, struct nlattr *attr, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 37cddd18f282..1b4e6f298648 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -187,17 +187,39 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, int mtu; if (!dst) { - struct rtable *rt; - - fl->u.ip4.flowi4_oif = dev->ifindex; - fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; - rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); - if (IS_ERR(rt)) { + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst = &rt->dst; + skb_dst_set(skb, dst); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + skb_dst_set(skb, dst); + break; +#endif + default: dev->stats.tx_carrier_errors++; goto tx_error_icmp; } - dst = &rt->dst; - skb_dst_set(skb, dst); } dst_hold(dst); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f1f78a742b36..b167f4a5b684 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1057,7 +1057,7 @@ struct compat_arpt_replace { u32 underflow[NF_ARP_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; - struct 
compat_arpt_entry entries[0]; + struct compat_arpt_entry entries[]; }; static inline void compat_release_entry(struct compat_arpt_entry *e) @@ -1383,7 +1383,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, struct compat_arpt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_arpt_entry entrytable[0]; + struct compat_arpt_entry entrytable[]; }; static int compat_get_entries(struct net *net, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 10b91ebdf213..c2670eaa74e6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1211,7 +1211,7 @@ struct compat_ipt_replace { u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ - struct compat_ipt_entry entries[0]; + struct compat_ipt_entry entries[]; }; static int @@ -1562,7 +1562,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, struct compat_ipt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_ipt_entry entrytable[0]; + struct compat_ipt_entry entrytable[]; }; static int diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2580303249e2..75545a829a2b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -32,6 +32,7 @@ #include <net/icmp.h> #include <net/protocol.h> #include <net/tcp.h> +#include <net/mptcp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/bottom_half.h> @@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) offsetof(struct ipstats_mib, syncp))); seq_putc(seq, '\n'); + mptcp_seq_show(seq); return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 042599cc691d..788c69d9bfe0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1621,12 +1621,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) + bool nopolicy, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : DST_HOST) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1674,7 +1673,6 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) new_rt->rt_gw6 = rt->rt_gw6; INIT_LIST_HEAD(&new_rt->rt_uncached); - new_rt->dst.flags |= DST_HOST; new_rt->dst.input = rt->dst.input; new_rt->dst.output = rt->dst.output; new_rt->dst.error = rt->dst.error; @@ -1734,7 +1732,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) return -ENOBUFS; @@ -1851,7 +1849,7 @@ static int __mkroute_input(struct sk_buff *skb, rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); + IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; @@ -2219,7 +2217,7 @@ local_input: rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? 
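The [0]-to-[] conversions in the compat_arpt/compat_ipt structs above replace the old GNU zero-length-array idiom with a C99 flexible array member: it must be the last field, adds nothing to sizeof, and tells the compiler and fortified memcpy() checks that the trailing storage is deliberately open-ended. A stand-alone sketch of the allocation pattern (struct blob and its fields are illustrative, not from the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct blob {
        unsigned int len;
        unsigned char data[];   /* flexible array member, formerly data[0] */
};

int main(void)
{
        const char payload[] = "example";
        struct blob *b = malloc(sizeof(*b) + sizeof(payload));

        if (!b)
                return 1;
        b->len = sizeof(payload);
        memcpy(b->data, payload, b->len);
        /* sizeof(*b) counts only the fixed header, not data[] */
        printf("header %zu bytes, payload %u bytes\n", sizeof(*b), b->len);
        free(b);
        return 0;
}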
: net->loopback_dev, flags | RTCF_LOCAL, res->type, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); + IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; @@ -2443,8 +2441,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(in_dev, NOXFRM), - do_cache); + IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5c57850fab4b..6d87de434377 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2948,8 +2948,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = -EPERM; else if (tp->repair_queue == TCP_SEND_QUEUE) WRITE_ONCE(tp->write_seq, val); - else if (tp->repair_queue == TCP_RECV_QUEUE) + else if (tp->repair_queue == TCP_RECV_QUEUE) { WRITE_ONCE(tp->rcv_nxt, val); + WRITE_ONCE(tp->copied_seq, val); + } else err = -EINVAL; break; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 645cc3009e64..f5f588b1f6e9 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -145,12 +145,13 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tcp_in_slow_start(tp)) - tcp_slow_start(tp, acked); - else { - bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt, 1); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; } + bictcp_update(ca, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, ca->cnt, acked); } /* diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 03af7c3e75ef..7e40322cc5ec 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); + return child; + } + sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 306e25d743e8..2f45cde168c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1109,6 +1109,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; + /* retransmit skbs might have a non-zero value in skb->dev + * because skb->dev is aliased with skb->rbnode.rb_left + */ + skb->dev = NULL; } inet = inet_sk(sk); @@ -3037,8 +3041,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_skb_tsorted_save(skb) { nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + if (nskb) { + nskb->dev = NULL; + err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC); + } else { + err = -ENOBUFS; + } } tcp_skb_tsorted_restore(skb); if (!err) { diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 471571e1ab26..6cebf412d590 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -10,10 +10,9 @@ #include <net/tcp.h> /* These factors are derived from the recommended values in the paper: - * .01 and and 7/8. We use 50 instead of 100 to account for - * delayed ack. + * .01 and 7/8. 
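The do_tcp_setsockopt() hunk above makes TCP_QUEUE_SEQ on the receive queue update copied_seq together with rcv_nxt, so a restored socket's read offset matches the injected sequence number. A minimal user-space sketch of the repair-mode call sequence that reaches this code (uapi constants guarded in case the libc headers lack them; needs CAP_NET_ADMIN, error handling trimmed):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_REPAIR
#define TCP_REPAIR       19
#define TCP_REPAIR_QUEUE 20
#define TCP_QUEUE_SEQ    21
#endif
#ifndef TCP_RECV_QUEUE
#define TCP_RECV_QUEUE   1
#endif

/* enter repair mode, select the receive queue, seed its sequence number */
static int set_rcv_seq(int fd, unsigned int seq)
{
        int on = 1, q = TCP_RECV_QUEUE;

        if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)))
                return -1;
        if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR_QUEUE, &q, sizeof(q)))
                return -1;
        return setsockopt(fd, IPPROTO_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
}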
*/ -#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) @@ -23,11 +22,13 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tcp_in_slow_start(tp)) - tcp_slow_start(tp, acked); - else - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), - 1); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + acked); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 3b36bb1a0dda..50a9a6e2c4cd 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -153,31 +153,34 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; if (tcp_in_slow_start(tp)) { - /* Slow start. */ - tcp_slow_start(tp, acked); + /* Slow start. */ + acked = tcp_slow_start(tp, acked); + if (!acked) + goto done; + } + + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } else { - /* Congestion avoidance. */ - if (veno->diff < beta) { - /* In the "non-congestive state", increase cwnd - * every rtt. - */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); - } else { - /* In the "congestive state", increase cwnd - * every other rtt. - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (veno->inc && - tp->snd_cwnd < tp->snd_cwnd_clamp) { - tp->snd_cwnd++; - veno->inc = 0; - } else - veno->inc = 1; - tp->snd_cwnd_cnt = 0; + /* In the "congestive state", increase cwnd + * every other rtt. 
+ */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc && + tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; } else - tp->snd_cwnd_cnt++; - } + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt += acked; } +done: if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; else if (tp->snd_cwnd > tp->snd_cwnd_clamp) diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index e00570dd0a69..3bb448761ca3 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -36,8 +36,6 @@ struct yeah { u32 reno_count; u32 fast_count; - - u32 pkts_acked; }; static void tcp_yeah_init(struct sock *sk) @@ -57,18 +55,6 @@ static void tcp_yeah_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void tcp_yeah_pkts_acked(struct sock *sk, - const struct ack_sample *sample) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct yeah *yeah = inet_csk_ca(sk); - - if (icsk->icsk_ca_state == TCP_CA_Open) - yeah->pkts_acked = sample->pkts_acked; - - tcp_vegas_pkts_acked(sk, sample); -} - static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -77,24 +63,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tcp_in_slow_start(tp)) - tcp_slow_start(tp, acked); + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + if (!acked) + goto do_vegas; + } - else if (!yeah->doing_reno_now) { + if (!yeah->doing_reno_now) { /* Scalable */ - - tp->snd_cwnd_cnt += yeah->pkts_acked; - if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } - - yeah->pkts_acked = 1; - + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + acked); } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. @@ -118,7 +99,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) * of bytes we send in an RTT is often less than our cwnd will allow. * So we keep track of our cwnd separately, in v_beg_snd_cwnd. 
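The bic, scalable, veno, and yeah changes in this stretch all apply one pattern: tcp_slow_start() consumes as many newly ACKed segments as fit below ssthresh and returns the leftover, which then feeds tcp_cong_avoid_ai() instead of a hard-coded 1, so no ACKs are lost at the slow-start/congestion-avoidance boundary. A toy user-space model of that hand-off (struct toy_tp and both helpers are simplified stand-ins, not the kernel versions):

#include <stdio.h>

struct toy_tp {
        unsigned int cwnd;      /* congestion window, in segments */
        unsigned int cwnd_cnt;  /* fractional-increase accumulator */
        unsigned int ssthresh;  /* slow-start threshold */
};

/* grow cwnd one-per-ACK up to ssthresh; return the unused remainder */
static unsigned int slow_start(struct toy_tp *tp, unsigned int acked)
{
        unsigned int room = tp->ssthresh > tp->cwnd ? tp->ssthresh - tp->cwnd : 0;
        unsigned int used = acked < room ? acked : room;

        tp->cwnd += used;
        return acked - used;
}

/* additive increase: one cwnd increment per w ACKed segments */
static void cong_avoid_ai(struct toy_tp *tp, unsigned int w, unsigned int acked)
{
        tp->cwnd_cnt += acked;
        if (tp->cwnd_cnt >= w) {
                tp->cwnd += tp->cwnd_cnt / w;
                tp->cwnd_cnt %= w;
        }
}

int main(void)
{
        struct toy_tp tp = { .cwnd = 8, .cwnd_cnt = 0, .ssthresh = 10 };
        unsigned int acked = 5;

        acked = slow_start(&tp, acked);         /* consumes 2, leaves 3 */
        if (acked)
                cong_avoid_ai(&tp, tp.cwnd, acked); /* leftover 3 counted */
        printf("cwnd=%u cwnd_cnt=%u\n", tp.cwnd, tp.cwnd_cnt);
        return 0;
}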
*/ - +do_vegas: if (after(ack, yeah->vegas.beg_snd_nxt)) { /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -232,7 +213,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = { .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, - .pkts_acked = tcp_yeah_pkts_acked, + .pkts_acked = tcp_vegas_pkts_acked, .owner = THIS_MODULE, .name = "yeah", diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b4035021bbd3..32564b350823 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2109,7 +2109,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); - BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index ae1344e4cec5..2ccaee98fddb 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -303,4 +303,14 @@ config IPV6_SEG6_BPF depends on IPV6_SEG6_LWTUNNEL depends on IPV6 = y +config IPV6_RPL_LWTUNNEL + bool "IPv6: RPL Source Routing Header support" + depends on IPV6 + select LWTUNNEL + ---help--- + Support for RFC6554 RPL Source Routing Header using the lightweight + tunnels mechanism. + + If unsure, say N. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 8ccf35514015..cf7b47bdb9b3 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o fib6_notifier.o + udp_offload.o seg6.o fib6_notifier.o rpl.o ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o @@ -26,6 +26,7 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o +ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5b9de773ce73..a11fd4d67832 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, + .rpl_seg_enabled = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -290,6 +291,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, + .rpl_seg_enabled = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -4398,6 +4400,59 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) } #endif +/* RFC6554 specifies an algorithm to avoid loops in segment routing by + * checking if the segments contain any local interface address. + * + * Quote: + * + * To detect loops in the SRH, a router MUST determine if the SRH + * includes multiple addresses assigned to any interface on that router. + * If such addresses appear more than once and are separated by at least + * one address not assigned to that router, the router MUST drop the packet. 
+ */ +int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, + unsigned char nsegs) +{ + const struct in6_addr *addr; + int i, ret = 0, found = 0; + struct inet6_ifaddr *ifp; + bool separated = false; + unsigned int hash; + bool hash_found; + + rcu_read_lock(); + for (i = 0; i < nsegs; i++) { + addr = &segs[i]; + hash = inet6_addr_hash(net, addr); + + hash_found = false; + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + + if (ipv6_addr_equal(&ifp->addr, addr)) { + hash_found = true; + break; + } + } + + if (hash_found) { + if (found > 1 && separated) { + ret = 1; + break; + } + + separated = false; + found++; + } else { + separated = true; + } + } + rcu_read_unlock(); + + return ret; +} + /* * Periodic address status verification */ @@ -5467,6 +5522,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; + array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; } static inline size_t inet6_ifla6_size(void) @@ -6848,6 +6904,13 @@ static const struct ctl_table addrconf_sysctl[] = { .extra2 = (void *)&two_five_five, }, { + .procname = "rpl_seg_enabled", + .data = &ipv6_devconf.rpl_seg_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d727c3b41495..345baa0a754f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -59,6 +59,7 @@ #endif #include <net/calipso.h> #include <net/seg6.h> +#include <net/rpl.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -1114,6 +1115,10 @@ static int __init inet6_init(void) if (err) goto seg6_fail; + err = rpl_init(); + if (err) + goto rpl_fail; + err = igmp6_late_init(); if (err) goto igmp6_late_err; @@ -1136,6 +1141,8 @@ sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: + rpl_exit(); +rpl_fail: seg6_exit(); seg6_fail: calipso_exit(); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a3b403ba8f8f..11143d039f16 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -207,22 +207,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } -static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) -{ - /* Fill padding... 
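A compact user-space rendering of the rule ipv6_chk_rpl_srh_loop() above enforces. Here is_local() stands in for the inet6_addr_lst hash walk, and the predicate follows the RFC 6554 wording (a repeated local address separated by a non-local hop indicates a loop), so treat it as a model of the intent rather than a line-for-line copy of the kernel function:

#include <stdbool.h>
#include <stdio.h>

static bool is_local(int addr)
{
        return addr == 1;       /* pretend address "1" belongs to this router */
}

static bool srh_has_loop(const int *segs, int nsegs)
{
        bool separated = false;
        int found = 0;

        for (int i = 0; i < nsegs; i++) {
                if (is_local(segs[i])) {
                        /* a repeat after a non-local hop is a loop */
                        if (found && separated)
                                return true;
                        separated = false;
                        found++;
                } else {
                        separated = true;
                }
        }
        return false;
}

int main(void)
{
        int loop[] = { 1, 7, 1 };       /* S1, X, S1: loop */
        int fine[] = { 1, 1, 7 };       /* consecutive S1 entries are allowed */

        printf("%d %d\n", srh_has_loop(loop, 3), srh_has_loop(fine, 3));
        return 0;
}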
*/ - if (tfclen) { - memset(tail, 0, tfclen); - tail += tfclen; - } - do { - int i; - for (i = 0; i < plen - 2; i++) - tail[i] = i + 1; - } while (0); - tail[plen - 2] = plen - 2; - tail[plen - 1] = proto; -} - int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index fd535053245b..8eab2c869d61 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -159,6 +159,40 @@ static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, return segs; } +static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + struct sk_buff *segs = ERR_PTR(-EINVAL); + const struct net_offload *ops; + int proto = xo->proto; + + skb->transport_header += x->props.header_len; + + if (proto == IPPROTO_BEETPH) { + struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; + + skb->transport_header += ph->hdrlen * 8; + proto = ph->nexthdr; + } + + if (x->sel.family != AF_INET6) { + skb->transport_header -= + (sizeof(struct ipv6hdr) - sizeof(struct iphdr)); + + if (proto == IPPROTO_TCP) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; + } + + __skb_pull(skb, skb_transport_offset(skb)); + ops = rcu_dereference(inet6_offloads[proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) @@ -168,6 +202,8 @@ static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x, return xfrm6_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm6_transport_gso_segment(x, skb, features); + case XFRM_MODE_BEET: + return xfrm6_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bcb9f5e62808..5a8bbcdcaf2b 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -48,6 +48,7 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif +#include <net/rpl.h> #include <linux/uaccess.h> @@ -468,6 +469,195 @@ looped_back: return -1; } +static int ipv6_rpl_srh_rcv(struct sk_buff *skb) +{ + struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; + struct inet6_skb_parm *opt = IP6CB(skb); + struct net *net = dev_net(skb->dev); + struct inet6_dev *idev; + struct ipv6hdr *oldhdr; + struct in6_addr addr; + unsigned char *buf; + int accept_rpl_seg; + int i, err; + u64 n = 0; + u32 r; + + idev = __in6_dev_get(skb->dev); + + accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; + if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) + accept_rpl_seg = idev->cnf.rpl_seg_enabled; + + if (!accept_rpl_seg) { + kfree_skb(skb); + return -1; + } + +looped_back: + hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); + + if (hdr->segments_left == 0) { + if (hdr->nexthdr == NEXTHDR_IPV6) { + int offset = (hdr->hdrlen + 1) << 3; + + skb_postpull_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); + + if (!pskb_pull(skb, offset)) { + kfree_skb(skb); + return -1; + } + skb_postpull_rcsum(skb, skb_transport_header(skb), + offset); + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb->encapsulation = 0; + + __skb_tunnel_rx(skb, skb->dev, net); + + netif_rx(skb); + return -1; + } + + opt->srcrt = skb_network_header_len(skb); + opt->lastopt = opt->srcrt; + skb->transport_header += (hdr->hdrlen + 1) 
<< 3; + opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); + + return 1; + } + + if (!pskb_may_pull(skb, sizeof(*hdr))) { + kfree_skb(skb); + return -1; + } + + n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); + r = do_div(n, (16 - hdr->cmpri)); + /* checks if calculation was without remainder and n fits into + * unsigned char which is segments_left field. Should not be + * higher than that. + */ + if (r || (n + 1) > 255) { + kfree_skb(skb); + return -1; + } + + if (hdr->segments_left > n + 1) { + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + ((&hdr->segments_left) - + skb_network_header(skb))); + return -1; + } + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE, 0, + GFP_ATOMIC)) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return -1; + } + } else { + err = skb_cow_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE); + if (unlikely(err)) { + kfree_skb(skb); + return -1; + } + } + + hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); + + if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri, + hdr->cmpre))) { + kfree_skb(skb); + return -1; + } + + hdr->segments_left--; + i = n - hdr->segments_left; + + buf = kzalloc(ipv6_rpl_srh_alloc_size(n + 1) * 2, GFP_ATOMIC); + if (unlikely(!buf)) { + kfree_skb(skb); + return -1; + } + + ohdr = (struct ipv6_rpl_sr_hdr *)buf; + ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); + chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); + + if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) || + (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) { + kfree_skb(skb); + kfree(buf); + return -1; + } + + err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); + if (err) { + icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); + kfree_skb(skb); + kfree(buf); + return -1; + } + + addr = ipv6_hdr(skb)->daddr; + ipv6_hdr(skb)->daddr = ohdr->rpl_segaddr[i]; + ohdr->rpl_segaddr[i] = addr; + + ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); + + oldhdr = ipv6_hdr(skb); + + skb_pull(skb, ((hdr->hdrlen + 1) << 3)); + skb_postpull_rcsum(skb, oldhdr, + sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); + skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); + memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_postpush_rcsum(skb, ipv6_hdr(skb), + sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); + + kfree(buf); + + skb_dst_drop(skb); + + ip6_route_input(skb); + + if (skb_dst(skb)->error) { + dst_input(skb); + return -1; + } + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (ipv6_hdr(skb)->hop_limit <= 1) { + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, + ICMPV6_EXC_HOPLIMIT, 0); + kfree_skb(skb); + return -1; + } + ipv6_hdr(skb)->hop_limit--; + + skb_pull(skb, sizeof(struct ipv6hdr)); + goto looped_back; + } + + dst_input(skb); + + return -1; +} + /******************************** Routing header. 
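Unpacking the segments_left validation arithmetic in ipv6_rpl_srh_rcv() above: the compressed header carries n addresses shortened with CmprI plus a final one shortened with CmprE, so the payload byte count hdrlen*8 - pad - (16 - cmpre) must divide evenly by 16 - cmpri, and n + 1 must still fit the 8-bit segments_left field. A worked stand-alone check (the sample values match the sizing example after the rpl.c compression code below):

#include <stdio.h>

int main(void)
{
        unsigned int hdrlen = 2, pad = 2, cmpri = 12, cmpre = 14;
        unsigned int bytes = hdrlen * 8 - pad - (16 - cmpre);
        unsigned int n = bytes / (16 - cmpri);
        unsigned int r = bytes % (16 - cmpri);

        /* n segments use cmpri; the (n + 1)-th uses cmpre */
        printf("n=%u remainder=%u valid=%d\n",
               n, r, r == 0 && n + 1 <= 255);
        return 0;
}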
********************************/ @@ -506,9 +696,16 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) return -1; } - /* segment routing */ - if (hdr->type == IPV6_SRCRT_TYPE_4) + switch (hdr->type) { + case IPV6_SRCRT_TYPE_4: + /* segment routing */ return ipv6_srh_rcv(skb); + case IPV6_SRCRT_TYPE_3: + /* rpl segment routing */ + return ipv6_rpl_srh_rcv(skb); + default: + break; + } looped_back: if (hdr->segments_left == 0) { diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 422dcc691f71..8c1ce78956ba 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -125,7 +125,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; -static int ila_build_state(struct nlattr *nla, +static int ila_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 524006aa0d78..cc6180e08a4f 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -311,7 +311,7 @@ static int vti6_rcv(struct sk_buff *skb) if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); - return 0; + goto discard; } ipv6h = ipv6_hdr(skb); @@ -450,15 +450,33 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) int mtu; if (!dst) { - fl->u.ip6.flowi6_oif = dev->ifindex; - fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; - dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); - if (dst->error) { - dst_release(dst); - dst = NULL; + switch (skb->protocol) { + case htons(ETH_P_IP): { + struct rtable *rt; + + fl->u.ip4.flowi4_oif = dev->ifindex; + fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); + if (IS_ERR(rt)) + goto tx_err_link_failure; + dst = &rt->dst; + skb_dst_set(skb, dst); + break; + } + case htons(ETH_P_IPV6): + fl->u.ip6.flowi6_oif = dev->ifindex; + fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); + if (dst->error) { + dst_release(dst); + dst = NULL; + goto tx_err_link_failure; + } + skb_dst_set(skb, dst); + break; + default: goto tx_err_link_failure; } - skb_dst_set(skb, dst); } dst_hold(dst); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 4a3feccd5b10..6ffa153e5166 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -197,6 +197,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev, return opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || + opt->nd_opt_type == ND_OPT_PREF64 || ndisc_ops_is_useropt(dev, opt->nd_opt_type); } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c973ace208c5..e27393498ecb 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1227,7 +1227,7 @@ struct compat_ip6t_replace { u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ - struct compat_ip6t_entry entries[0]; + struct compat_ip6t_entry entries[]; }; static int @@ -1571,7 +1571,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; - struct compat_ip6t_entry entrytable[0]; + struct compat_ip6t_entry entrytable[]; }; static int diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2430c2f6819a..310cbddaa533 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ 
-1062,8 +1062,6 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt) flags |= DST_NOCOUNT; if (rt->dst_nopolicy) flags |= DST_NOPOLICY; - if (rt->dst_host) - flags |= DST_HOST; return flags; } @@ -1349,7 +1347,6 @@ static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; - rt->dst.flags |= DST_HOST; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; @@ -3142,7 +3139,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, goto out; } - rt->dst.flags |= DST_HOST; rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; @@ -3475,7 +3471,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; - err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap, + err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; @@ -3645,8 +3641,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; - if (rt->fib6_dst.plen == 128) - rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c new file mode 100644 index 000000000000..dc4f20e23bf7 --- /dev/null +++ b/net/ipv6/rpl.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-only +/** + * Authors: + * (C) 2020 Alexander Aring <alex.aring@gmail.com> + */ + +#include <net/ipv6.h> +#include <net/rpl.h> + +#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x)) + +static void ipv6_rpl_addr_decompress(struct in6_addr *dst, + const struct in6_addr *daddr, + const void *post, unsigned char pfx) +{ + memcpy(dst, daddr, pfx); + memcpy(&dst->s6_addr[pfx], post, IPV6_PFXTAIL_LEN(pfx)); +} + +static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr, + unsigned char pfx) +{ + memcpy(dst, &addr->s6_addr[pfx], IPV6_PFXTAIL_LEN(pfx)); +} + +static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i) +{ + return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)]; +} + +size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri, + unsigned char cmpre) +{ + return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); +} + +void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, + const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, unsigned char n) +{ + int i; + + outhdr->nexthdr = inhdr->nexthdr; + outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3); + outhdr->pad = 0; + outhdr->type = inhdr->type; + outhdr->segments_left = inhdr->segments_left; + outhdr->cmpri = 0; + outhdr->cmpre = 0; + + for (i = 0; i <= n; i++) + ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr, + ipv6_rpl_segdata_pos(inhdr, i), + inhdr->cmpri); + + ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr, + ipv6_rpl_segdata_pos(inhdr, n), + inhdr->cmpre); +} + +static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, + unsigned char n) +{ + unsigned char plen; + int i; + + for (plen = 0; plen < sizeof(*daddr); plen++) { + for (i = 0; i <= n; i++) { + if (daddr->s6_addr[plen] != + inhdr->rpl_segaddr[i].s6_addr[plen]) + return plen; + } + } + + return plen; +} + +static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr, + const struct in6_addr 
*last_segment) +{ + unsigned int plen; + + for (plen = 0; plen < sizeof(*daddr); plen++) { + if (daddr->s6_addr[plen] != last_segment->s6_addr[plen]) + break; + } + + return plen; +} + +void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, + const struct ipv6_rpl_sr_hdr *inhdr, + const struct in6_addr *daddr, unsigned char n) +{ + unsigned char cmpri, cmpre; + size_t seglen; + int i; + + cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n); + cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]); + + outhdr->nexthdr = inhdr->nexthdr; + seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); + outhdr->hdrlen = seglen >> 3; + if (seglen & 0x7) { + outhdr->hdrlen++; + outhdr->pad = 8 - (seglen & 0x7); + } else { + outhdr->pad = 0; + } + outhdr->type = inhdr->type; + outhdr->segments_left = inhdr->segments_left; + outhdr->cmpri = cmpri; + outhdr->cmpre = cmpre; + + for (i = 0; i <= n; i++) + ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i), + &inhdr->rpl_segaddr[i], cmpri); + + ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n), + &inhdr->rpl_segaddr[n], cmpre); +} diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c new file mode 100644 index 000000000000..203037afe001 --- /dev/null +++ b/net/ipv6/rpl_iptunnel.c @@ -0,0 +1,380 @@ +// SPDX-License-Identifier: GPL-2.0-only +/** + * Authors: + * (C) 2020 Alexander Aring <alex.aring@gmail.com> + */ + +#include <linux/rpl_iptunnel.h> + +#include <net/dst_cache.h> +#include <net/ip6_route.h> +#include <net/lwtunnel.h> +#include <net/ipv6.h> +#include <net/rpl.h> + +struct rpl_iptunnel_encap { + struct ipv6_rpl_sr_hdr srh[0]; +}; + +struct rpl_lwt { + struct dst_cache cache; + struct rpl_iptunnel_encap tuninfo; +}; + +static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct rpl_lwt *)lwt->data; +} + +static inline struct rpl_iptunnel_encap * +rpl_encap_lwtunnel(struct lwtunnel_state *lwt) +{ + return &rpl_lwt_lwtunnel(lwt)->tuninfo; +} + +static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = { + [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY }, +}; + +static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh, + size_t seglen) +{ + int err; + + if ((srh->hdrlen << 3) != seglen) + return false; + + /* check at least one segment and seglen fit with segments_left */ + if (!srh->segments_left || + (srh->segments_left * sizeof(struct in6_addr)) != seglen) + return false; + + if (srh->cmpri || srh->cmpre) + return false; + + err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr, + srh->segments_left); + if (err) + return false; + + if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) & + IPV6_ADDR_MULTICAST) + return false; + + return true; +} + +static int rpl_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RPL_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + struct ipv6_rpl_sr_hdr *srh; + struct rpl_lwt *rlwt; + int err, srh_len; + + if (family != AF_INET6) + return -EINVAL; + + err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla, + rpl_iptunnel_policy, extack); + if (err < 0) + return err; + + if (!tb[RPL_IPTUNNEL_SRH]) + return -EINVAL; + + srh = nla_data(tb[RPL_IPTUNNEL_SRH]); + srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]); + + if (srh_len < sizeof(*srh)) + return -EINVAL; + + /* verify that SRH is consistent */ + if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh))) + return -EINVAL; + + newts = 
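To make the CmprI/CmprE bookkeeping in ipv6_rpl_srh_compress() above concrete: three segments sharing a 12-byte prefix with the destination plus a final segment sharing 14 bytes compress to 3 * 4 + 2 = 14 bytes, which rounds up to two 8-byte header units with two pad bytes. A stand-alone calculation mirroring the hdrlen/pad logic (and the inverse of the receive-side check sketched earlier):

#include <stdio.h>

#define ADDR_LEN 16
#define PFXTAIL_LEN(pfx) (ADDR_LEN - (pfx))

/* n segments stored with cmpri, one final segment stored with cmpre */
static size_t srh_seg_size(unsigned int n, unsigned int cmpri, unsigned int cmpre)
{
        return n * PFXTAIL_LEN(cmpri) + PFXTAIL_LEN(cmpre);
}

int main(void)
{
        size_t seglen = srh_seg_size(3, 12, 14);
        unsigned int hdrlen = seglen >> 3, pad = 0;

        if (seglen & 0x7) {     /* leftover bytes round up into pad */
                hdrlen++;
                pad = 8 - (seglen & 0x7);
        }
        printf("seglen=%zu hdrlen=%u pad=%u\n", seglen, hdrlen, pad);
        return 0;
}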
lwtunnel_state_alloc(srh_len + sizeof(*rlwt)); + if (!newts) + return -ENOMEM; + + rlwt = rpl_lwt_lwtunnel(newts); + + err = dst_cache_init(&rlwt->cache, GFP_ATOMIC); + if (err) { + kfree(newts); + return err; + } + + memcpy(&rlwt->tuninfo.srh, srh, srh_len); + + newts->type = LWTUNNEL_ENCAP_RPL; + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; +} + +static void rpl_destroy_state(struct lwtunnel_state *lwt) +{ + dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache); +} + +static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, + const struct ipv6_rpl_sr_hdr *srh) +{ + struct ipv6_rpl_sr_hdr *isrh, *csrh; + const struct ipv6hdr *oldhdr; + struct ipv6hdr *hdr; + unsigned char *buf; + size_t hdrlen; + int err; + + oldhdr = ipv6_hdr(skb); + + buf = kzalloc(ipv6_rpl_srh_alloc_size(srh->segments_left - 1) * 2, + GFP_ATOMIC); + if (!buf) + return -ENOMEM; + + isrh = (struct ipv6_rpl_sr_hdr *)buf; + csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3)); + + memcpy(isrh, srh, sizeof(*isrh)); + memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], + (srh->segments_left - 1) * 16); + isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr; + + ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0], + isrh->segments_left - 1); + + hdrlen = ((csrh->hdrlen + 1) << 3); + + err = skb_cow_head(skb, hdrlen + skb->mac_len); + if (unlikely(err)) + return err; + + skb_pull(skb, sizeof(struct ipv6hdr)); + skb_postpull_rcsum(skb, skb_network_header(skb), + sizeof(struct ipv6hdr)); + + skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + hdr = ipv6_hdr(skb); + memmove(hdr, oldhdr, sizeof(*hdr)); + isrh = (void *)hdr + sizeof(*hdr); + memcpy(isrh, csrh, hdrlen); + + isrh->nexthdr = hdr->nexthdr; + hdr->nexthdr = NEXTHDR_ROUTING; + hdr->daddr = srh->rpl_segaddr[0]; + + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + + skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen); + + kfree(buf); + + return 0; +} + +static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) +{ + struct dst_entry *dst = skb_dst(skb); + struct rpl_iptunnel_encap *tinfo; + int err = 0; + + if (skb->protocol != htons(ETH_P_IPV6)) + return -EINVAL; + + tinfo = rpl_encap_lwtunnel(dst->lwtstate); + + err = rpl_do_srh_inline(skb, rlwt, tinfo->srh); + if (err) + return err; + + return 0; +} + +static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; + struct rpl_lwt *rlwt; + int err = -EINVAL; + + rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + + err = rpl_do_srh(skb, rlwt); + if (unlikely(err)) + goto drop; + + preempt_disable(); + dst = dst_cache_get(&rlwt->cache); + preempt_enable(); + + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + dst_release(dst); + goto drop; + } + + preempt_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + preempt_enable(); + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if 
(unlikely(err)) + goto drop; + + return dst_output(net, sk, skb); + +drop: + kfree_skb(skb); + return err; +} + +static int rpl_input(struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; + struct rpl_lwt *rlwt; + int err; + + rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + + err = rpl_do_srh(skb, rlwt); + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + + preempt_disable(); + dst = dst_cache_get(&rlwt->cache); + preempt_enable(); + + skb_dst_drop(skb); + + if (!dst) { + ip6_route_input(skb); + dst = skb_dst(skb); + if (!dst->error) { + preempt_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, + &ipv6_hdr(skb)->saddr); + preempt_enable(); + } + } else { + skb_dst_set(skb, dst); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + return err; + + return dst_input(skb); +} + +static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, + struct rpl_iptunnel_encap *tuninfo) +{ + struct rpl_iptunnel_encap *data; + struct nlattr *nla; + int len; + + len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh); + + nla = nla_reserve(skb, attrtype, len); + if (!nla) + return -EMSGSIZE; + + data = nla_data(nla); + memcpy(data, tuninfo->srh, len); + + return 0; +} + +static int rpl_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); + + if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo)) + return -EMSGSIZE; + + return 0; +} + +static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate); + + return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh)); +} + +static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a); + struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b); + int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh); + + if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh)) + return 1; + + return memcmp(a_hdr, b_hdr, len); +} + +static const struct lwtunnel_encap_ops rpl_ops = { + .build_state = rpl_build_state, + .destroy_state = rpl_destroy_state, + .output = rpl_output, + .input = rpl_input, + .fill_encap = rpl_fill_encap_info, + .get_encap_size = rpl_encap_nlsize, + .cmp_encap = rpl_encap_cmp, + .owner = THIS_MODULE, +}; + +int __init rpl_init(void) +{ + int err; + + err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); + if (err) + goto out; + + pr_info("RPL Segment Routing with IPv6\n"); + + return 0; + +out: + return err; +} + +void rpl_exit(void) +{ + lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL); +} diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index ac837afb9040..c7cbfeae94f5 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -376,7 +376,7 @@ drop: return err; } -static int seg6_build_state(struct nlattr *nla, +static int seg6_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 8165802d8e05..52493423f329 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -970,8 +970,9 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) return 0; } -static int seg6_local_build_state(struct nlattr *nla, unsigned int family, - const void *cfg, struct lwtunnel_state **ts, +static int seg6_local_build_state(struct net *net, struct nlattr *nla, + 
unsigned int family, const void *cfg, + struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[SEG6_LOCAL_MAX + 1]; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index e11bdb0aaa15..25b7ebda2fab 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -78,7 +78,7 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], - list_byaddr) { + list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7b654d2b8bb2..0f72813fed53 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,8 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include <linux/ieee80211.h> @@ -1012,8 +1011,15 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, prev_beacon_int = sdata->vif.bss_conf.beacon_int; sdata->vif.bss_conf.beacon_int = params->beacon_interval; - if (params->he_cap) + if (params->he_cap && params->he_oper) { sdata->vif.bss_conf.he_support = true; + sdata->vif.bss_conf.htc_trig_based_pkt_ext = + le32_get_bits(params->he_oper->he_oper_params, + IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK); + sdata->vif.bss_conf.frame_time_rts_th = + le32_get_bits(params->he_oper->he_oper_params, + IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK); + } mutex_lock(&local->mtx); err = ieee80211_vif_use_channel(sdata, ¶ms->chandef, @@ -1034,6 +1040,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; + sdata->control_port_no_preauth = + params->crypto.control_port_no_preauth; sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local, ¶ms->crypto, sdata->vif.type); @@ -1045,6 +1053,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, params->crypto.control_port_no_encrypt; vlan->control_port_over_nl80211 = params->crypto.control_port_over_nl80211; + vlan->control_port_no_preauth = + params->crypto.control_port_no_preauth; vlan->encrypt_headroom = ieee80211_cs_headroom(sdata->local, ¶ms->crypto, diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 266d63819415..829dcad69c2c 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include <linux/debugfs.h> @@ -78,6 +78,7 @@ static const char * const sta_flag_names[] = { FLAG(MPSP_OWNER), FLAG(MPSP_RECIPIENT), FLAG(PS_DELIVER), + FLAG(USES_ENCRYPTION), #undef FLAG }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index de39f9ca9935..f8ed4f621f7f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -912,6 +912,7 @@ struct ieee80211_sub_if_data { u16 sequence_number; __be16 control_port_protocol; bool control_port_no_encrypt; + bool control_port_no_preauth; 
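The control_port_no_preauth flag added here flows from the cfg80211 crypto settings into each interface and gates the receive path shown in the rx.c hunk below; that delivery decision condenses to a predicate like this sketch (0x88C7 is the IEEE 802.11 pre-authentication ethertype):

#include <stdbool.h>

#define ETH_P_PREAUTH 0x88C7    /* IEEE 802.11 pre-authentication */

static bool deliver_over_nl80211(unsigned short proto,
                                 unsigned short control_port_protocol,
                                 bool over_nl80211, bool no_preauth)
{
        if (!over_nl80211)
                return false;
        if (proto == control_port_protocol)     /* e.g. ETH_P_PAE */
                return true;
        return proto == ETH_P_PREAUTH && !no_preauth;
}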
bool control_port_over_nl80211; int encrypt_headroom; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 128b3468d13e..d069825705d6 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -519,6 +519,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) master->control_port_no_encrypt; sdata->control_port_over_nl80211 = master->control_port_over_nl80211; + sdata->control_port_no_preauth = + master->control_port_no_preauth; sdata->vif.cab_queue = master->vif.cab_queue; memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); @@ -1463,6 +1465,8 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE); sdata->control_port_no_encrypt = false; + sdata->control_port_over_nl80211 = false; + sdata->control_port_no_preauth = false; sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; sdata->vif.bss_conf.idle = true; sdata->vif.bss_conf.txpower = INT_MIN; /* unset */ diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 6354491c5a09..8f403c1bb908 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -6,7 +6,7 @@ * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright 2018-2019 Intel Corporation + * Copyright 2018-2020 Intel Corporation */ #include <linux/if_ether.h> @@ -277,22 +277,29 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta ? sta->sta.addr : bcast_addr, ret); } -int ieee80211_set_tx_key(struct ieee80211_key *key) +static int _ieee80211_set_tx_key(struct ieee80211_key *key, bool force) { struct sta_info *sta = key->sta; struct ieee80211_local *local = key->local; assert_key_lock(local); + set_sta_flag(sta, WLAN_STA_USES_ENCRYPTION); + sta->ptk_idx = key->conf.keyidx; - if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) + if (force || !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } +int ieee80211_set_tx_key(struct ieee80211_key *key) +{ + return _ieee80211_set_tx_key(key, false); +} + static void ieee80211_pairwise_rekey(struct ieee80211_key *old, struct ieee80211_key *new) { @@ -481,11 +488,8 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); if (new && - !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) { - sta->ptk_idx = idx; - clear_sta_flag(sta, WLAN_STA_BLOCK_BA); - ieee80211_check_fast_xmit(sta); - } + !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) + _ieee80211_set_tx_key(new, true); } else { rcu_assign_pointer(sta->gtk[idx], new); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 944e86da5c65..8345926193de 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -576,7 +576,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, WIPHY_FLAG_REPORTS_OBSS | WIPHY_FLAG_OFFCHAN_TX; - if (ops->remain_on_channel) + if (!use_chanctx || ops->remain_on_channel) wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS | @@ -589,6 +589,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); if (!ops->hw_scan) { wiphy->features 
|= NL80211_FEATURE_LOW_PRIORITY_SCAN | @@ -1081,6 +1083,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) NL80211_EXT_FEATURE_EXT_KEY_ID); } + if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_ADHOC)) + wiphy_ext_feature_set(local->hw.wiphy, + NL80211_EXT_FEATURE_DEL_IBSS_STA); + /* * Calculate scan IE length -- we need this to alloc * memory and to subtract from the driver limit. It diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8a2f2fa21916..16d75da0996a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5458,6 +5458,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; sdata->control_port_over_nl80211 = req->crypto.control_port_over_nl80211; + sdata->control_port_no_preauth = req->crypto.control_port_no_preauth; sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto, sdata->vif.type); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2ffb4ee467e1..91a13aee4378 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2497,7 +2497,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, struct net_device *dev = sdata->dev; if (unlikely((skb->protocol == sdata->control_port_protocol || - skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) && + (skb->protocol == cpu_to_be16(ETH_P_PREAUTH) && + !sdata->control_port_no_preauth)) && sdata->control_port_over_nl80211)) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); bool noencrypt = !(status->flag & RX_FLAG_DECRYPTED); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f357156b86ee..f8d5c2515829 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1049,6 +1049,11 @@ static void __sta_info_destroy_part2(struct sta_info *sta) might_sleep(); lockdep_assert_held(&local->sta_mtx); + while (sta->sta_state == IEEE80211_STA_AUTHORIZED) { + ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); + WARN_ON_ONCE(ret); + } + /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); @@ -2150,19 +2155,41 @@ static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) return 0; } +static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, + int tid) +{ + unsigned int start; + u64 value; + + do { + start = u64_stats_fetch_begin(&rxstats->syncp); + value = rxstats->msdu[tid]; + } while (u64_stats_fetch_retry(&rxstats->syncp, start)); + + return value; +} + static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; + int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - unsigned int start; + if (!ieee80211_hw_check(&local->hw, USES_RSS)) + tidstats->rx_msdu += + sta_get_tidstats_msdu(&sta->rx_stats, tid); + + if (sta->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; - do { - start = u64_stats_fetch_begin(&sta->rx_stats.syncp); - tidstats->rx_msdu = sta->rx_stats.msdu[tid]; - } while (u64_stats_fetch_retry(&sta->rx_stats.syncp, start)); + cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + tidstats->rx_msdu += + sta_get_tidstats_msdu(cpurxs, tid); + } + } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } @@ -2266,7 +2293,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { - sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); + if 
(!ieee80211_hw_check(&local->hw, USES_RSS)) + sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); if (sta->pcpu_rx_stats) { for_each_possible_cpu(cpu) { diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 364a35414d05..36f1abaab9ff 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -98,6 +98,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_MPSP_OWNER, WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, + WLAN_STA_USES_ENCRYPTION, NUM_WLAN_STA_FLAGS, }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 83147385c200..b65284f210c3 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -590,10 +590,13 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; - if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) + if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) { tx->key = NULL; - else if (tx->sta && - (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) + return TX_CONTINUE; + } + + if (tx->sta && + (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx]))) tx->key = key; else if (ieee80211_is_group_privacy_action(tx->skb) && (key = rcu_dereference(tx->sdata->default_multicast_key))) @@ -654,6 +657,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; + } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { + return TX_DROP; } return TX_CONTINUE; @@ -3599,8 +3605,25 @@ begin: tx.skb = skb; tx.sdata = vif_to_sdata(info->control.vif); - if (txq->sta) + if (txq->sta) { tx.sta = container_of(txq->sta, struct sta_info, sta); + /* + * Drop unicast frames to unauthorised stations unless they are + * EAPOL frames from the local station. 
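The drop condition in the hunk continuing below is dense; restated as a standalone predicate it reads as follows. This is an illustrative sketch only -- mac80211 open-codes the test inline, and the helper name and flattened boolean parameters are not part of the patch:

static bool drop_unauthorized_unicast(bool is_mesh, bool is_ocb,
				      bool is_multicast, bool sta_authorized,
				      bool is_port_ctrl_proto,
				      bool from_local_addr)
{
	/* mesh and OCB interfaces manage peer state themselves */
	if (is_mesh || is_ocb)
		return false;
	/* group-addressed frames are not subject to the check */
	if (is_multicast)
		return false;
	/* authorized peers may receive any frame */
	if (sta_authorized)
		return false;
	/* sole exception: port-control (EAPOL) frames transmitted from the
	 * local interface address, e.g. during the initial handshake
	 */
	return !(is_port_ctrl_proto && from_local_addr);
}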
+ */ + if (unlikely(!ieee80211_vif_is_mesh(&tx.sdata->vif) && + tx.sdata->vif.type != NL80211_IFTYPE_OCB && + !is_multicast_ether_addr(hdr->addr1) && + !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) && + (!(info->control.flags & + IEEE80211_TX_CTRL_PORT_CTRL_PROTO) || + !ether_addr_equal(tx.sdata->vif.addr, + hdr->addr2)))) { + I802_DEBUG_INC(local->tx_handlers_drop_unauth_port); + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + } /* * The key can be removed while the packet was queued, so need to call @@ -4670,6 +4693,7 @@ static int ieee80211_beacon_protect(struct sk_buff *skb, { ieee80211_tx_result res; struct ieee80211_tx_data tx; + struct sk_buff *check_skb; memset(&tx, 0, sizeof(tx)); tx.key = rcu_dereference(sdata->default_beacon_key); @@ -4680,8 +4704,11 @@ static int ieee80211_beacon_protect(struct sk_buff *skb, __skb_queue_head_init(&tx.skbs); __skb_queue_tail(&tx.skbs, skb); res = ieee80211_tx_h_encrypt(&tx); + check_skb = __skb_dequeue(&tx.skbs); + /* we may crash after this, but it'd be a bug in crypto */ + WARN_ON(check_skb != skb); if (WARN_ON_ONCE(res != TX_CONTINUE)) - return -1; + return -EINVAL; return 0; } @@ -5314,6 +5341,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ethhdr *ehdr; + u32 ctrl_flags = 0; u32 flags; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE @@ -5323,6 +5351,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, proto != cpu_to_be16(ETH_P_PREAUTH)) return -EINVAL; + if (proto == sdata->control_port_protocol) + ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; + if (unencrypted) flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; else @@ -5348,7 +5379,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_mac_header(skb); local_bh_disable(); - __ieee80211_subif_start_xmit(skb, skb->dev, flags, 0); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags); local_bh_enable(); return 0; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ea7d277a8c45..20436c86b9bf 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -912,8 +912,12 @@ static void ieee80211_parse_extension_element(u32 *crc, break; case WLAN_EID_EXT_HE_OPERATION: if (len >= sizeof(*elems->he_operation) && - len == ieee80211_he_oper_size(data) - 1) + len == ieee80211_he_oper_size(data) - 1) { + if (crc) + *crc = crc32_be(*crc, (void *)elem, + elem->datalen + 2); elems->he_operation = data; + } break; case WLAN_EID_EXT_UORA: if (len == 1) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 44b675016393..2def85718d94 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -162,7 +162,7 @@ drop: return -EINVAL; } -static int mpls_build_state(struct nlattr *nla, +static int mpls_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 4e98d9edfd0a..baa0640527c7 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ + mib.o pm_netlink.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 40d1bb18fd60..c151628bd416 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ 
-44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) *idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6])); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hmac) +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; @@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, u8 key2be[8]; int i; + if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) + len = SHA256_DIGEST_SIZE; + put_unaligned_be64(key1, key1be); put_unaligned_be64(key2, key2be); @@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; - put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); - put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + memcpy(&input[SHA256_BLOCK_SIZE], msg, len); sha256_init(&state); - sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + sha256_update(&state, input, SHA256_BLOCK_SIZE + len); /* emit sha256(K1 || msg) on the second input block, so we can * reuse 'input' for the last hashing @@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void) char hmac[20], hmac_hex[41]; u32 nonce1, nonce2; u64 key1, key2; + u8 msg[8]; int i, j; for (i = 0; i < ARRAY_SIZE(tests); ++i) { @@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void) nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); for (j = 0; j < 20; ++j) sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); hmac_hex[40] = 0; diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c new file mode 100644 index 000000000000..a536586742f2 --- /dev/null +++ b/net/mptcp/diag.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2019 Red Hat + * + * Author: Davide Caratti <dcaratti@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/inet_diag.h> +#include <net/netlink.h> +#include <uapi/linux/mptcp.h> +#include "protocol.h" + +static int subflow_get_info(const struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *sf; + struct nlattr *start; + u32 flags = 0; + int err; + + start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); + if (!start) + return -EMSGSIZE; + + rcu_read_lock(); + sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data); + if (!sf) { + err = 0; + goto nla_failure; + } + + if (sf->mp_capable) + flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM; + if (sf->request_mptcp) + flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC; + if (sf->mp_join) + flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM; + if (sf->request_join) + flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC; + if (sf->backup) + flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; + if (sf->request_bkup) + flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; + if (sf->fully_established) + flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; + if (sf->conn_finished) + flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; + if (sf->map_valid) + flags |= MPTCP_SUBFLOW_FLAG_MAPVALID; + + if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + sf->rel_write_seq) || + nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, 
sf->map_seq, + MPTCP_SUBFLOW_ATTR_PAD) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + sf->map_subflow_seq) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || + nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + sf->map_data_len) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) { + err = -EMSGSIZE; + goto nla_failure; + } + + rcu_read_unlock(); + nla_nest_end(skb, start); + return 0; + +nla_failure: + rcu_read_unlock(); + nla_nest_cancel(skb, start); + return err; +} + +static size_t subflow_get_info_size(const struct sock *sk) +{ + size_t size = 0; + + size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ + nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ + nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ + nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ + 0; + return size; +} + +void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops) +{ + ops->get_info = subflow_get_info; + ops->get_info_size = subflow_get_info_size; +} diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c new file mode 100644 index 000000000000..0a6a15f3456d --- /dev/null +++ b/net/mptcp/mib.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/seq_file.h> +#include <net/ip.h> +#include <net/mptcp.h> +#include <net/snmp.h> +#include <net/net_namespace.h> + +#include "mib.h" + +static const struct snmp_mib mptcp_snmp_list[] = { + SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), + SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), + SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), + SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), + SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), + SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), + SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_SENTINEL +}; + +/* mptcp_mib_alloc - allocate percpu mib counters + * + * These are allocated when the first mptcp socket is created so + * we do not waste percpu memory if mptcp isn't in use. 
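Once the counters are allocated, accounting an event is a single macro call from the protocol code; the MPTCP_INC_STATS()/__MPTCP_INC_STATS() wrappers defined in mib.h further down are NULL-safe, so call sites need no extra checks. A hypothetical call site for illustration (the function name is made up):

static void example_account_join_syn(struct net *net)
{
	/* harmless even before mptcp_mib_alloc() has run: the wrapper
	 * checks net->mib.mptcp_statistics for NULL first
	 */
	MPTCP_INC_STATS(net, MPTCP_MIB_JOINSYNRX);
}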
+ */ +bool mptcp_mib_alloc(struct net *net) +{ + struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); + + if (!mib) + return false; + + if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) + free_percpu(mib); + + return true; +} + +void mptcp_seq_show(struct seq_file *seq) +{ + struct net *net = seq->private; + int i; + + seq_puts(seq, "MPTcpExt:"); + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_printf(seq, " %s", mptcp_snmp_list[i].name); + + seq_puts(seq, "\nMPTcpExt:"); + + if (!net->mib.mptcp_statistics) { + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_puts(seq, " 0"); + + return; + } + + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_printf(seq, " %lu", + snmp_fold_field(net->mib.mptcp_statistics, + mptcp_snmp_list[i].entry)); + seq_putc(seq, '\n'); +} diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h new file mode 100644 index 000000000000..d7de340fc997 --- /dev/null +++ b/net/mptcp/mib.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +enum linux_mptcp_mib_field { + MPTCP_MIB_NUM = 0, + MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ + MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ + MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ + MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ + MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + __MPTCP_MIB_MAX +}; + +#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX +struct mptcp_mib { + unsigned long mibs[LINUX_MIB_MPTCP_MAX]; +}; + +static inline void MPTCP_INC_STATS(struct net *net, + enum linux_mptcp_mib_field field) +{ + if (likely(net->mib.mptcp_statistics)) + SNMP_INC_STATS(net->mib.mptcp_statistics, field); +} + +static inline void __MPTCP_INC_STATS(struct net *net, + enum linux_mptcp_mib_field field) +{ + if (likely(net->mib.mptcp_statistics)) + __SNMP_INC_STATS(net->mib.mptcp_statistics, field); +} + +bool mptcp_mib_alloc(struct net *net); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 63c8ee49cef2..bd220ee4aac9 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, mp_opt->rcvr_key, mp_opt->data_len); break; + case MPTCPOPT_MP_JOIN: + mp_opt->mp_join = 1; + if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { + mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; + mp_opt->join_id = *ptr++; + mp_opt->token = get_unaligned_be32(ptr); + ptr += 4; + mp_opt->nonce = get_unaligned_be32(ptr); + ptr += 4; + pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", + mp_opt->backup, mp_opt->join_id, + mp_opt->token, mp_opt->nonce); + } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { + mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; + mp_opt->join_id = *ptr++; + mp_opt->thmac = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->nonce = get_unaligned_be32(ptr); + ptr += 4; + pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", + mp_opt->backup, mp_opt->join_id, + mp_opt->thmac, 
mp_opt->nonce); + } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { + ptr += 2; + memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); + pr_debug("MP_JOIN hmac"); + } else { + pr_warn("MP_JOIN bad option size"); + mp_opt->mp_join = 0; + } + break; + case MPTCPOPT_DSS: pr_debug("DSS"); ptr++; @@ -178,6 +210,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, break; + case MPTCPOPT_ADD_ADDR: + mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; + if (!mp_opt->echo) { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR || + opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } else { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } + + mp_opt->add_addr = 1; + mp_opt->port = 0; + mp_opt->addr_id = *ptr++; + pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id); + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); + ptr += 4; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else { + memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); + ptr += 16; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#endif + if (!mp_opt->echo) { + mp_opt->ahmac = get_unaligned_be64(ptr); + ptr += 8; + } + break; + + case MPTCPOPT_RM_ADDR: + if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) + break; + + mp_opt->rm_addr = 1; + mp_opt->rm_id = *ptr++; + pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); + break; + default: break; } @@ -231,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, opts->sndr_key = subflow->local_key; *size = TCPOLEN_MPTCP_MPC_SYN; return true; + } else if (subflow->request_join) { + pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, + subflow->local_nonce); + opts->suboptions = OPTION_MPTCP_MPJ_SYN; + opts->join_id = subflow->local_id; + opts->token = subflow->remote_token; + opts->nonce = subflow->local_nonce; + opts->backup = subflow->request_bkup; + *size = TCPOLEN_MPTCP_MPJ_SYN; + return true; } return false; } @@ -240,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct tcp_sock *tp = tcp_sk(sk); - pr_debug("subflow=%p", subflow); if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = tp->rx_opt.mptcp.sndr_key; - } else { + pr_debug("subflow=%p, remote_key=%llu", subflow, + subflow->remote_key); + } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { + subflow->mp_join = 1; + subflow->thmac = tp->rx_opt.mptcp.thmac; + subflow->remote_nonce = tp->rx_opt.mptcp.nonce; + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, + subflow->thmac, subflow->remote_nonce); + } else if (subflow->request_mptcp) { tcp_sk(sk)->is_mptcp = 0; } } +/* MP_JOIN client subflow must wait for 4th ack before sending any data: + * TCP can't 
schedule delack timer before the subflow is fully established. + * MPTCP uses the delack timer to do 3rd ack retransmissions + */ +static void schedule_3rdack_retransmission(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + + /* reschedule with a timeout above RTT, as we must look only for drop */ + if (tp->srtt_us) + timeout = tp->srtt_us << 1; + else + timeout = TCP_TIMEOUT_INIT; + + WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +static void clear_3rdack_retransmission(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_delack_timer); + icsk->icsk_ack.timeout = 0; + icsk->icsk_ack.ato = 0; + icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); +} + static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, @@ -259,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_ext *mpext; unsigned int data_len; - pr_debug("subflow=%p fourth_ack=%d seq=%x:%x remaining=%d", subflow, - subflow->fourth_ack, subflow->snd_isn, - skb ? TCP_SKB_CB(skb)->seq : 0, remaining); + /* When skb is not available, we better over-estimate the emitted + * options len. A full DSS option (28 bytes) is longer than + * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so + * tell the caller to defer the estimate to + * mptcp_established_options_dss(), which will reserve enough space. + */ + if (!skb) + return false; - if (subflow->mp_capable && !subflow->fourth_ack && skb && - subflow->snd_isn == TCP_SKB_CB(skb)->seq) { - /* When skb is not available, we better over-estimate the - * emitted options len. A full DSS option is longer than - * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit - * that. - */ + /* MPC/MPJ needed only on 3rd ack packet */ + if (subflow->fully_established || + subflow->snd_isn != TCP_SKB_CB(skb)->seq) + return false; + + if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? 
mpext->data_len : 0; @@ -297,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, data_len); return true; + } else if (subflow->mp_join) { + opts->suboptions = OPTION_MPTCP_MPJ_ACK; + memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); + *size = TCPOLEN_MPTCP_MPJ_ACK; + pr_debug("subflow=%p", subflow); + + schedule_3rdack_retransmission(sk); + return true; } return false; } @@ -335,7 +493,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, struct mptcp_sock *msk; unsigned int ack_size; bool ret = false; - bool can_ack; u8 tcp_fin; if (skb) { @@ -364,7 +521,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ - can_ack = true; opts->ext_copy.use_ack = 0; msk = mptcp_sk(subflow->conn); if (!READ_ONCE(msk->can_ack)) { @@ -388,6 +544,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return true; } +static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[7]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s_addr, 4); + msg[5] = 0; + msg[6] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); + + return get_unaligned_be64(hmac); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in6_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[19]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s6_addr, 16); + msg[17] = 0; + msg[18] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); + + return get_unaligned_be64(hmac); +} +#endif + +static bool mptcp_established_options_addr(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct mptcp_addr_info saddr; + int len; + + if (!mptcp_pm_should_signal(msk) || + !(mptcp_pm_addr_signal(msk, remaining, &saddr))) + return false; + + len = mptcp_add_addr_len(saddr.family); + if (remaining < len) + return false; + + *size = len; + opts->addr_id = saddr.id; + if (saddr.family == AF_INET) { + opts->suboptions |= OPTION_MPTCP_ADD_ADDR; + opts->addr = saddr.addr; + opts->ahmac = add_addr_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr); + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (saddr.family == AF_INET6) { + opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; + opts->addr6 = saddr.addr6; + opts->ahmac = add_addr6_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr6); + } +#endif + pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -395,6 +628,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int opt_size = 0; bool ret = false; + opts->suboptions = 0; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -409,6 +644,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; + if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) { + 
*size += opt_size; + remaining -= opt_size; + ret = true; + } return ret; } @@ -425,54 +665,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); return true; + } else if (subflow_req->mp_join) { + opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; + opts->backup = subflow_req->backup; + opts->join_id = subflow_req->local_id; + opts->thmac = subflow_req->thmac; + opts->nonce = subflow_req->local_nonce; + pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", + subflow_req, opts->backup, opts->join_id, + opts->thmac, opts->nonce); + *size = TCPOLEN_MPTCP_MPJ_SYNACK; + return true; } return false; } -static bool check_fourth_ack(struct mptcp_subflow_context *subflow, - struct sk_buff *skb, - struct mptcp_options_received *mp_opt) +static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, + struct mptcp_subflow_context *subflow, + struct sk_buff *skb, + struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack - * are relevant + * will make the subflow fully established */ - if (likely(subflow->fourth_ack || - TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) - return true; + if (likely(subflow->fully_established)) { + /* on passive sockets, check for 3rd ack retransmission + * note that msk is always set by subflow_syn_recv_sock() + * for mp_join subflows + */ + if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && + subflow->mp_join && mp_opt->mp_join && + READ_ONCE(msk->pm.server_side)) + tcp_send_ack(sk); + goto fully_established; + } - if (mp_opt->use_ack) - subflow->fourth_ack = 1; + /* we should process OoO packets before the first subflow is fully + * established, but not expected for MP_JOIN subflows + */ + if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) + return subflow->mp_capable; - if (subflow->can_ack) - return true; + if (mp_opt->use_ack) { + /* subflows are fully established as soon as we get any + * additional ack. 
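The passive-side retransmission test earlier in check_fully_established() packs five conditions into one branch; as a standalone predicate (an illustrative restatement, not a helper the patch adds) it reads:

static bool is_retransmitted_3rd_ack(u32 seq, u32 end_seq, u32 ssn_offset,
				     bool subflow_mp_join, bool opt_mp_join,
				     bool server_side)
{
	return seq == ssn_offset + 1 &&	/* still the first in-sequence segment */
	       end_seq == seq &&	/* a pure ACK, carrying no payload */
	       subflow_mp_join &&	/* local subflow was created via MP_JOIN */
	       opt_mp_join &&		/* and the packet carries MP_JOIN again */
	       server_side;		/* only meaningful on the passive end */
}

When it fires, the peer evidently missed the third ACK, so tcp_send_ack() retransmits it.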
+ */ + subflow->fully_established = 1; + goto fully_established; + } + + WARN_ON_ONCE(subflow->can_ack); /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + tcp_sk(sk)->is_mptcp = 0; return false; } + + subflow->fully_established = 1; subflow->remote_key = mp_opt->sndr_key; subflow->can_ack = 1; + +fully_established: + if (likely(subflow->pm_notified)) + return true; + + subflow->pm_notified = 1; + if (subflow->mp_join) { + clear_3rdack_retransmission(sk); + mptcp_pm_subflow_established(msk, subflow); + } else { + mptcp_pm_fully_established(msk); + } return true; } +static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +{ + u32 old_ack32, cur_ack32; + + if (use_64bit) + return cur_ack; + + old_ack32 = (u32)old_ack; + cur_ack32 = (u32)cur_ack; + cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; + if (unlikely(before(cur_ack32, old_ack32))) + return cur_ack + (1LL << 32); + return cur_ack; +} + +static void update_una(struct mptcp_sock *msk, + struct mptcp_options_received *mp_opt) +{ + u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); + u64 write_seq = READ_ONCE(msk->write_seq); + + /* avoid ack expansion on update conflict, to reduce the risk of + * wrongly expanding to a future ack sequence number, which is way + * more dangerous than missing an ack + */ + new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + + /* ACK for data not even sent yet? Ignore. */ + if (after64(new_snd_una, write_seq)) + new_snd_una = old_snd_una; + + while (after64(new_snd_una, old_snd_una)) { + snd_una = old_snd_una; + old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, + new_snd_una); + if (old_snd_una == snd_una) { + mptcp_data_acked((struct sock *)msk); + break; + } + } +} + +static bool add_addr_hmac_valid(struct mptcp_sock *msk, + struct mptcp_options_received *mp_opt) +{ + u64 hmac = 0; + + if (mp_opt->echo) + return true; + + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) + hmac = add_addr_generate_hmac(msk->remote_key, + msk->local_key, + mp_opt->addr_id, &mp_opt->addr); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + hmac = add_addr6_generate_hmac(msk->remote_key, + msk->local_key, + mp_opt->addr_id, &mp_opt->addr6); +#endif + + pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", + msk, (unsigned long long)hmac, + (unsigned long long)mp_opt->ahmac); + + return hmac == mp_opt->ahmac; +} + void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct tcp_options_received *opt_rx) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received *mp_opt; struct mptcp_ext *mpext; mp_opt = &opt_rx->mptcp; - if (!check_fourth_ack(subflow, skb, mp_opt)) + if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) return; + if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { + struct mptcp_addr_info addr; + + addr.port = htons(mp_opt->port); + addr.id = mp_opt->addr_id; + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + addr.family = AF_INET; + addr.addr = mp_opt->addr; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { + addr.family = AF_INET6; + addr.addr6 = mp_opt->addr6; + } +#endif + if (!mp_opt->echo) + mptcp_pm_add_addr_received(msk, &addr); + mp_opt->add_addr = 0; + } + if (!mp_opt->dss) return; + /* we can't wait for recvmsg() to update the ack_seq, 
otherwise + * monodirectional flows will get stuck + */ + if (mp_opt->use_ack) + update_una(msk, mp_opt); + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return; @@ -499,12 +879,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, mpext->use_map = 1; } - if (mp_opt->use_ack) { - mpext->data_ack = mp_opt->data_ack; - mpext->use_ack = 1; - mpext->ack64 = mp_opt->ack64; - } - mpext->data_fin = mp_opt->data_fin; } @@ -523,10 +897,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) else len = TCPOLEN_MPTCP_MPC_ACK; - *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | - (MPTCPOPT_MP_CAPABLE << 12) | - (MPTCP_SUPPORTED_VERSION << 8) | - MPTCP_CAP_HMAC_SHA256); + *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, + MPTCP_SUPPORTED_VERSION, + MPTCP_CAP_HMAC_SHA256); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -548,6 +921,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) } mp_capable_done: + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } +#endif + + if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, + TCPOLEN_MPTCP_RM_ADDR_BASE, + 0, opts->rm_id); + } + + if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYN, + opts->backup, opts->join_id); + put_unaligned_be32(opts->token, ptr); + ptr += 1; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } + + if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYNACK, + opts->backup, opts->join_id); + put_unaligned_be64(opts->thmac, ptr); + ptr += 2; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } + + if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_ACK, 0, 0); + memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); + ptr += 5; + } + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -569,10 +1013,7 @@ mp_capable_done: flags |= MPTCP_DSS_DATA_FIN; } - *ptr++ = htonl((TCPOPT_MPTCP << 24) | - (len << 16) | - (MPTCPOPT_DSS << 12) | - (flags)); + *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { put_unaligned_be64(mpext->data_ack, ptr); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c new file mode 100644 index 000000000000..064639f72487 --- /dev/null +++ b/net/mptcp/pm.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Intel Corporation. 
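mptcp_option() itself is not visible in these hunks; judging from the open-coded htonl() expressions it replaces above, it presumably packs the TCP option kind, length, MPTCP subtype and the subtype-specific nibble and byte as below. A sketch inferred from the call sites, not the verbatim helper:

static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
	/* kind | length | subtype + high nibble | low byte */
	return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
		     (nib << 8) | field);
}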
+ */ +#include <linux/kernel.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static struct workqueue_struct *pm_wq; + +/* path manager command handlers */ + +int mptcp_pm_announce_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + pr_debug("msk=%p, local_id=%d", msk, addr->id); + + msk->pm.local = *addr; + WRITE_ONCE(msk->pm.addr_signal, true); + return 0; +} + +int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) +{ + return -ENOTSUPP; +} + +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id) +{ + return -ENOTSUPP; +} + +/* path manager event handlers */ + +void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); + + WRITE_ONCE(pm->server_side, server_side); +} + +bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + int ret; + + pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, + pm->subflows_max, READ_ONCE(pm->accept_subflow)); + + /* try to avoid acquiring the lock below */ + if (!READ_ONCE(pm->accept_subflow)) + return false; + + spin_lock_bh(&pm->lock); + ret = pm->subflows < pm->subflows_max; + if (ret && ++pm->subflows == pm->subflows_max) + WRITE_ONCE(pm->accept_subflow, false); + spin_unlock_bh(&pm->lock); + + return ret; +} + +/* return true if the new status bit is currently cleared, that is, this event + * can be served, possibly by an already scheduled work + */ +static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, + enum mptcp_pm_status new_status) +{ + pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status, + BIT(new_status)); + if (msk->pm.status & BIT(new_status)) + return false; + + msk->pm.status |= BIT(new_status); + if (queue_work(pm_wq, &msk->pm.work)) + sock_hold((struct sock *)msk); + return true; +} + +void mptcp_pm_fully_established(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p", msk); + + /* try to avoid acquiring the lock below */ + if (!READ_ONCE(pm->work_pending)) + return; + + spin_lock_bh(&pm->lock); + + if (READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); + + spin_unlock_bh(&pm->lock); +} + +void mptcp_pm_connection_closed(struct mptcp_sock *msk) +{ + pr_debug("msk=%p", msk); +} + +void mptcp_pm_subflow_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p", msk); + + if (!READ_ONCE(pm->work_pending)) + return; + + spin_lock_bh(&pm->lock); + + if (READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); + + spin_unlock_bh(&pm->lock); +} + +void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) +{ + pr_debug("msk=%p", msk); +} + +void mptcp_pm_add_addr_received(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, + READ_ONCE(pm->accept_addr)); + + /* avoid acquiring the lock if there is no room for further addresses */ + if (!READ_ONCE(pm->accept_addr)) + return; + + spin_lock_bh(&pm->lock); + + /* be sure there is something to signal, re-checking under the PM lock */ + if (READ_ONCE(pm->accept_addr) && + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + pm->remote = *addr; + + spin_unlock_bh(&pm->lock); +} + +/* path manager helpers */ + +bool mptcp_pm_addr_signal(struct mptcp_sock 
*msk, unsigned int remaining, + struct mptcp_addr_info *saddr) +{ + int ret = false; + + spin_lock_bh(&msk->pm.lock); + + /* double check after the lock is acquired */ + if (!mptcp_pm_should_signal(msk)) + goto out_unlock; + + if (remaining < mptcp_add_addr_len(msk->pm.local.family)) + goto out_unlock; + + *saddr = msk->pm.local; + WRITE_ONCE(msk->pm.addr_signal, false); + ret = true; + +out_unlock: + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +{ + return mptcp_pm_nl_get_local_id(msk, skc); +} + +static void pm_worker(struct work_struct *work) +{ + struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, + work); + struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + sock_put(sk); +} + +void mptcp_pm_data_init(struct mptcp_sock *msk) +{ + msk->pm.add_addr_signaled = 0; + msk->pm.add_addr_accepted = 0; + msk->pm.local_addr_used = 0; + msk->pm.subflows = 0; + WRITE_ONCE(msk->pm.work_pending, false); + WRITE_ONCE(msk->pm.addr_signal, false); + WRITE_ONCE(msk->pm.accept_addr, false); + WRITE_ONCE(msk->pm.accept_subflow, false); + msk->pm.status = 0; + + spin_lock_init(&msk->pm.lock); + INIT_WORK(&msk->pm.work, pm_worker); + + mptcp_pm_nl_data_init(msk); +} + +void mptcp_pm_close(struct mptcp_sock *msk) +{ + if (cancel_work_sync(&msk->pm.work)) + sock_put((struct sock *)msk); +} + +void mptcp_pm_init(void) +{ + pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); + if (!pm_wq) + panic("Failed to allocate workqueue"); + + mptcp_pm_nl_init(); +} diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c new file mode 100644 index 000000000000..a0ce7f324499 --- /dev/null +++ b/net/mptcp/pm_netlink.c @@ -0,0 +1,857 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Red Hat, Inc. 
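One idiom recurs throughout pm.c above: every event handler first tests a flag published with WRITE_ONCE() outside the lock, then re-validates it under pm->lock before acting. A distilled template of the pattern (illustrative only, not a function the patch adds):

static void pm_event_template(struct mptcp_sock *msk,
			      enum mptcp_pm_status event)
{
	struct mptcp_pm_data *pm = &msk->pm;

	/* lock-free filter: most events find nothing to do */
	if (!READ_ONCE(pm->work_pending))
		return;

	spin_lock_bh(&pm->lock);

	/* re-check under the lock: another context may have cleared it */
	if (READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, event);

	spin_unlock_bh(&pm->lock);
}

The unlocked read keeps the common no-op case off the spinlock; the locked re-check keeps the decision race-free.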
+ */ + +#include <linux/inet.h> +#include <linux/kernel.h> +#include <net/tcp.h> +#include <net/netns/generic.h> +#include <net/mptcp.h> +#include <net/genetlink.h> +#include <uapi/linux/mptcp.h> + +#include "protocol.h" + +/* forward declaration */ +static struct genl_family mptcp_genl_family; + +static int pm_nl_pernet_id; + +struct mptcp_pm_addr_entry { + struct list_head list; + unsigned int flags; + int ifindex; + struct mptcp_addr_info addr; + struct rcu_head rcu; +}; + +struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; + struct list_head local_addr_list; + unsigned int addrs; + unsigned int add_addr_signal_max; + unsigned int add_addr_accept_max; + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; +}; + +#define MPTCP_PM_ADDR_MAX 8 + +static bool addresses_equal(const struct mptcp_addr_info *a, + struct mptcp_addr_info *b, bool use_port) +{ + bool addr_equals = false; + + if (a->family != b->family) + return false; + + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); +#endif + + if (!addr_equals) + return false; + if (!use_port) + return true; + + return a->port == b->port; +} + +static void local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->port = 0; + addr->family = skc->skc_family; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_rcv_saddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_rcv_saddr; +#endif +} + +static void remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = skc->skc_dport; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_daddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_daddr; +#endif +} + +static bool lookup_subflow_by_saddr(const struct list_head *list, + struct mptcp_addr_info *saddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + local_address(skc, &cur); + if (addresses_equal(&cur, saddr, false)) + return true; + } + + return false; +} + +static struct mptcp_pm_addr_entry * +select_local_address(const struct pm_nl_pernet *pernet, + struct mptcp_sock *msk) +{ + struct mptcp_pm_addr_entry *entry, *ret = NULL; + + rcu_read_lock(); + spin_lock_bh(&msk->join_list_lock); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + + /* avoid any address already in use by subflows and + * pending join + */ + if (entry->addr.family == ((struct sock *)msk)->sk_family && + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && + !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + ret = entry; + break; + } + } + spin_unlock_bh(&msk->join_list_lock); + rcu_read_unlock(); + return ret; +} + +static struct mptcp_pm_addr_entry * +select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) +{ + struct mptcp_pm_addr_entry *entry, *ret = NULL; + int i = 0; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal + * the address list in order. + * Note: removal from the local address list during the msk life-cycle + * can lead to additional addresses not being announced. 
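To make the positional announce semantics concrete, here is a standalone userspace analogue of the scan that follows (plain C, hypothetical types): only SIGNAL-flagged entries count toward the requested position, which is why removing an entry mid-stream shifts later addresses down a slot.

#include <stdbool.h>
#include <stddef.h>

struct demo_entry { int id; bool signal; };

/* return the pos-th SIGNAL entry, or NULL when fewer exist */
static const struct demo_entry *demo_select(const struct demo_entry *list,
					    size_t n, unsigned int pos)
{
	unsigned int i = 0;

	for (size_t k = 0; k < n; k++) {
		if (!list[k].signal)
			continue;
		if (i++ == pos)
			return &list[k];
	}
	return NULL;
}

With entries {1, SIGNAL}, {2, subflow-only}, {3, SIGNAL}: pos 0 yields id 1 and pos 1 yields id 3. Delete id 1 after the first announce and the next lookup at pos 1 returns NULL, so id 3 is never announced -- exactly the life-cycle caveat noted above.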
+ */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; + if (i++ == pos) { + ret = entry; + break; + } + } + rcu_read_unlock(); + return ret; +} + +static void check_work_pending(struct mptcp_sock *msk) +{ + if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max && + (msk->pm.local_addr_used == msk->pm.local_addr_max || + msk->pm.subflows == msk->pm.subflows_max)) + WRITE_ONCE(msk->pm.work_pending, false); +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *local; + struct mptcp_addr_info remote; + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, msk->pm.local_addr_max, + msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max, + msk->pm.subflows, msk->pm.subflows_max); + + /* check first for announce */ + if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) { + local = select_signal_address(pernet, + msk->pm.add_addr_signaled); + + if (local) { + msk->pm.add_addr_signaled++; + mptcp_pm_announce_addr(msk, &local->addr); + } else { + /* pick failed, avoid further attempts later */ + msk->pm.local_addr_used = msk->pm.add_addr_signal_max; + } + + check_work_pending(msk); + } + + /* check if we should create a new subflow */ + if (msk->pm.local_addr_used < msk->pm.local_addr_max && + msk->pm.subflows < msk->pm.subflows_max) { + remote_address((struct sock_common *)sk, &remote); + + local = select_local_address(pernet, msk); + if (local) { + msk->pm.local_addr_used++; + msk->pm.subflows++; + check_work_pending(msk); + spin_unlock_bh(&msk->pm.lock); + __mptcp_subflow_connect(sk, local->ifindex, + &local->addr, &remote); + spin_lock_bh(&msk->pm.lock); + return; + } + + /* lookup failed, avoid further attempts later */ + msk->pm.local_addr_used = msk->pm.local_addr_max; + check_work_pending(msk); + } +} + +void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info remote; + struct mptcp_addr_info local; + + pr_debug("accepted %d:%d remote family %d", + msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max, + msk->pm.remote.family); + msk->pm.add_addr_accepted++; + msk->pm.subflows++; + if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max || + msk->pm.subflows >= msk->pm.subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + + /* connect to the specified remote address, using whatever + * local address the routing configuration will pick. 
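The zeroed local address built below defers source-address selection to the kernel's route lookup, the same behaviour an ordinary TCP client gets by calling connect() without bind(). A userspace analogue for illustration (hypothetical helper):

#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

static int connect_unbound(const struct sockaddr_in *dst)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	/* no bind(): the route lookup chooses the local address */
	if (connect(fd, (const struct sockaddr *)dst, sizeof(*dst)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}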
+ */ + remote = msk->pm.remote; + if (!remote.port) + remote.port = sk->sk_dport; + memset(&local, 0, sizeof(local)); + local.family = remote.family; + + spin_unlock_bh(&msk->pm.lock); + __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote); + spin_lock_bh(&msk->pm.lock); +} + +static bool address_use_port(struct mptcp_pm_addr_entry *entry) +{ + return (entry->flags & + (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == + MPTCP_PM_ADDR_FLAG_SIGNAL; +} + +static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_pm_addr_entry *cur; + int ret = -EINVAL; + + spin_lock_bh(&pernet->lock); + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ + if (pernet->next_id > 255) + goto out; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) + goto out; + + /* do not insert a duplicate address; take ports into account only + * for signal-only addresses + */ + list_for_each_entry(cur, &pernet->local_addr_list, list) { + if (addresses_equal(&cur->addr, &entry->addr, + address_use_port(entry) && + address_use_port(cur))) + goto out; + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + pernet->add_addr_signal_max++; + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + pernet->local_addr_max++; + + entry->addr.id = pernet->next_id++; + pernet->addrs++; + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + ret = entry->addr.id; + +out: + spin_unlock_bh(&pernet->lock); + return ret; +} + +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +{ + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info skc_local; + struct mptcp_addr_info msk_local; + struct pm_nl_pernet *pernet; + int ret = -1; + + if (WARN_ON_ONCE(!msk)) + return -1; + + /* The 0 ID mapping is defined by the first subflow, copied into the msk + * addr + */ + local_address((struct sock_common *)msk, &msk_local); + local_address((struct sock_common *)skc, &skc_local); + if (addresses_equal(&msk_local, &skc_local, false)) + return 0; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (addresses_equal(&entry->addr, &skc_local, false)) { + ret = entry->addr.id; + break; + } + } + rcu_read_unlock(); + if (ret >= 0) + return ret; + + /* address not found, add to local list */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->flags = 0; + entry->addr = skc_local; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + if (ret < 0) + kfree(entry); + + return ret; +} + +void mptcp_pm_nl_data_init(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + struct pm_nl_pernet *pernet; + bool subflows; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max); + pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max); + pm->local_addr_max = READ_ONCE(pernet->local_addr_max); + pm->subflows_max = READ_ONCE(pernet->subflows_max); + subflows = !!pm->subflows_max; + WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) || + !!pm->add_addr_signal_max); + WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows); + WRITE_ONCE(pm->accept_subflow, subflows); +} + +#define MPTCP_PM_CMD_GRP_OFFSET 0 + +static const struct genl_multicast_group mptcp_pm_mcgrps[] = { + [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, 
}, +}; + +static const struct nla_policy +mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = { + [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, + [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, + [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr), }, + [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 }, + [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 }, + [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 }, +}; + +static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { + [MPTCP_PM_ATTR_ADDR] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), + [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, +}; + +static int mptcp_pm_family_to_addr(int family) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (family == AF_INET6) + return MPTCP_PM_ADDR_ATTR_ADDR6; +#endif + return MPTCP_PM_ADDR_ATTR_ADDR4; +} + +static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + int err, addr_addr; + + if (!attr) { + GENL_SET_ERR_MSG(info, "missing address info"); + return -EINVAL; + } + + /* no validation needed - was already done via nested policy */ + err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_addr_policy, info->extack); + if (err) + return err; + + memset(entry, 0, sizeof(*entry)); + if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { + if (!require_family) + goto skip_family; + + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing family"); + return -EINVAL; + } + + entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + if (entry->addr.family != AF_INET +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + && entry->addr.family != AF_INET6 +#endif + ) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "unknown address family"); + return -EINVAL; + } + addr_addr = mptcp_pm_family_to_addr(entry->addr.family); + if (!tb[addr_addr]) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address data"); + return -EINVAL; + } + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]); + else +#endif + entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); + +skip_family: + if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) + entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + + if (tb[MPTCP_PM_ADDR_ATTR_ID]) + entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + + if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) + entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + + return 0; +} + +static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) +{ + return net_generic(genl_info_net(info), pm_nl_pernet_id); +} + +static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, true, &addr); + if (ret < 0) + return ret; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + GENL_SET_ERR_MSG(info, "can't allocate addr"); + return -ENOMEM; + } + + *entry = addr; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + if (ret < 0) { + GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); + kfree(entry); + return ret; + } + + return 0; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet 
*pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + +static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + ret = -EINVAL; + goto out; + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + pernet->add_addr_signal_max--; + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + pernet->local_addr_max--; + + pernet->addrs--; + list_del_rcu(&entry->list); + kfree_rcu(entry, rcu); +out: + spin_unlock_bh(&pernet->lock); + return ret; +} + +static void __flush_addrs(struct pm_nl_pernet *pernet) +{ + while (!list_empty(&pernet->local_addr_list)) { + struct mptcp_pm_addr_entry *cur; + + cur = list_entry(pernet->local_addr_list.next, + struct mptcp_pm_addr_entry, list); + list_del_rcu(&cur->list); + kfree_rcu(cur, rcu); + } +} + +static void __reset_counters(struct pm_nl_pernet *pernet) +{ + pernet->add_addr_signal_max = 0; + pernet->add_addr_accept_max = 0; + pernet->local_addr_max = 0; + pernet->addrs = 0; +} + +static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + + spin_lock_bh(&pernet->lock); + __flush_addrs(pernet); + __reset_counters(pernet); + spin_unlock_bh(&pernet->lock); + return 0; +} + +static int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_addr_info *addr = &entry->addr; + struct nlattr *attr; + + attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) + goto nla_put_failure; + if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) + goto nla_put_failure; + if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) + goto nla_put_failure; + if (entry->ifindex && + nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) + goto nla_put_failure; + + if (addr->family == AF_INET) + nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, + addr->addr.s_addr); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6); +#endif + nla_nest_end(skb, attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, attr); + return -EMSGSIZE; +} + +static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct sk_buff *msg; + void *reply; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + info->genlhdr->cmd); + if (!reply) { + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + 
ret = -EINVAL; + goto unlock_fail; + } + + ret = mptcp_nl_fill_addr(msg, entry); + if (ret) + goto unlock_fail; + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + spin_unlock_bh(&pernet->lock); + return ret; + +unlock_fail: + spin_unlock_bh(&pernet->lock); + +fail: + nlmsg_free(msg); + return ret; +} + +static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int id = cb->args[0]; + void *hdr; + + pernet = net_generic(net, pm_nl_pernet_id); + + spin_lock_bh(&pernet->lock); + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id <= id) + continue; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + break; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + break; + } + + id = entry->addr.id; + genlmsg_end(msg, hdr); + } + spin_unlock_bh(&pernet->lock); + + cb->args[0] = id; + return msg->len; +} + +static int parse_limit(struct genl_info *info, int id, unsigned int *limit) +{ + struct nlattr *attr = info->attrs[id]; + + if (!attr) + return 0; + + *limit = nla_get_u32(attr); + if (*limit > MPTCP_PM_ADDR_MAX) { + GENL_SET_ERR_MSG(info, "limit greater than maximum"); + return -EINVAL; + } + return 0; +} + +static int +mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + unsigned int rcv_addrs, subflows; + int ret; + + spin_lock_bh(&pernet->lock); + rcv_addrs = pernet->add_addr_accept_max; + ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); + if (ret) + goto unlock; + + subflows = pernet->subflows_max; + ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); + if (ret) + goto unlock; + + WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->subflows_max, subflows); + +unlock: + spin_unlock_bh(&pernet->lock); + return ret; +} + +static int +mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct sk_buff *msg; + void *reply; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + MPTCP_PM_CMD_GET_LIMITS); + if (!reply) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + READ_ONCE(pernet->add_addr_accept_max))) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, + READ_ONCE(pernet->subflows_max))) + goto fail; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +fail: + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + nlmsg_free(msg); + return -EMSGSIZE; +} + +static struct genl_ops mptcp_pm_ops[] = { + { + .cmd = MPTCP_PM_CMD_ADD_ADDR, + .doit = mptcp_nl_cmd_add_addr, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_DEL_ADDR, + .doit = mptcp_nl_cmd_del_addr, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, + .doit = mptcp_nl_cmd_flush_addrs, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_GET_ADDR, + .doit = mptcp_nl_cmd_get_addr, + .dumpit = mptcp_nl_cmd_dump_addrs, + }, + { + .cmd = MPTCP_PM_CMD_SET_LIMITS, + .doit = mptcp_nl_cmd_set_limits, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_GET_LIMITS, + .doit = mptcp_nl_cmd_get_limits, + }, +}; + +static struct genl_family mptcp_genl_family 
__ro_after_init = { + .name = MPTCP_PM_NAME, + .version = MPTCP_PM_VER, + .maxattr = MPTCP_PM_ATTR_MAX, + .policy = mptcp_pm_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = mptcp_pm_ops, + .n_ops = ARRAY_SIZE(mptcp_pm_ops), + .mcgrps = mptcp_pm_mcgrps, + .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), +}; + +static int __net_init pm_nl_init_net(struct net *net) +{ + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + + INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + __reset_counters(pernet); + pernet->next_id = 1; + spin_lock_init(&pernet->lock); + return 0; +} + +static void __net_exit pm_nl_exit_net(struct list_head *net_list) +{ + struct net *net; + + list_for_each_entry(net, net_list, exit_list) { + /* net is removed from namespace list, can't race with + * other modifiers + */ + __flush_addrs(net_generic(net, pm_nl_pernet_id)); + } +} + +static struct pernet_operations mptcp_pm_pernet_ops = { + .init = pm_nl_init_net, + .exit_batch = pm_nl_exit_net, + .id = &pm_nl_pernet_id, + .size = sizeof(struct pm_nl_pernet), +}; + +void mptcp_pm_nl_init(void) +{ + if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) + panic("Failed to register MPTCP PM pernet subsystem.\n"); + + if (genl_register_family(&mptcp_genl_family)) + panic("Failed to register MPTCP PM netlink family\n"); +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 04c3caed92df..1833bc1f4a43 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #endif #include <net/mptcp.h> #include "protocol.h" +#include "mib.h" #define MPTCP_SAME_STATE TCP_MAX_STATES @@ -37,6 +38,8 @@ struct mptcp_skb_cb { #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) +static struct percpu_counter mptcp_sockets_allocated; + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. @@ -104,19 +107,6 @@ set_state: return ssock; } -static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - sock_owned_by_me((const struct sock *)msk); - - mptcp_for_each_subflow(msk, subflow) { - return mptcp_subflow_tcp_sock(subflow); - } - - return NULL; -} - static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) @@ -254,6 +244,60 @@ wake: sk->sk_data_ready(sk); } +static void __mptcp_flush_join_list(struct mptcp_sock *msk) +{ + if (likely(list_empty(&msk->join_list))) + return; + + spin_lock_bh(&msk->join_list_lock); + list_splice_tail_init(&msk->join_list, &msk->conn_list); + spin_unlock_bh(&msk->join_list_lock); +} + +static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +{ + long tout = ssk && inet_csk(ssk)->icsk_pending ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; + + if (tout <= 0) + tout = mptcp_sk(sk)->timer_ival; + mptcp_sk(sk)->timer_ival = tout > 0 ? 
tout : TCP_RTO_MIN; +} + +static bool mptcp_timer_pending(struct sock *sk) +{ + return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); +} + +static void mptcp_reset_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned long tout; + + /* should never be called with mptcp level timer cleared */ + tout = READ_ONCE(mptcp_sk(sk)->timer_ival); + if (WARN_ON_ONCE(!tout)) + tout = TCP_RTO_MIN; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); +} + +void mptcp_data_acked(struct sock *sk) +{ + mptcp_reset_timer(sk); + + if (!sk_stream_is_writeable(sk) && + schedule_work(&mptcp_sk(sk)->work)) + sock_hold(sk); +} + +static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; +} + static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { if (!msk->cached_ext) @@ -277,41 +321,149 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) return NULL; } -static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, - const struct sk_buff *skb, - const struct mptcp_ext *mpext) +static bool mptcp_skb_can_collapse_to(u64 write_seq, + const struct sk_buff *skb, + const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; + return mpext && mpext->data_seq + mpext->data_len == write_seq; +} + +static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, + const struct page_frag *pfrag, + const struct mptcp_data_frag *df) +{ + return df && pfrag->page == df->page && + df->data_seq + df->data_len == msk->write_seq; +} + +static void dfrag_uncharge(struct sock *sk, int len) +{ + sk_mem_uncharge(sk, len); + sk_wmem_queued_add(sk, -len); +} + +static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) +{ + int len = dfrag->data_len + dfrag->overhead; + + list_del(&dfrag->list); + dfrag_uncharge(sk, len); + put_page(dfrag->page); +} + +static void mptcp_clean_una(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *dtmp, *dfrag; + u64 snd_una = atomic64_read(&msk->snd_una); + bool cleaned = false; + + list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { + if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) + break; + + dfrag_clear(sk, dfrag); + cleaned = true; + } + + dfrag = mptcp_rtx_head(sk); + if (dfrag && after64(snd_una, dfrag->data_seq)) { + u64 delta = dfrag->data_seq + dfrag->data_len - snd_una; + + dfrag->data_seq += delta; + dfrag->data_len -= delta; + + dfrag_uncharge(sk, delta); + cleaned = true; + } + + if (cleaned) { + sk_mem_reclaim_partial(sk); + + /* Only wake up writers if a subflow is ready */ + if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) + sk_stream_write_space(sk); + } +} + +/* ensure we get enough memory for the frag hdr, beyond some minimal amount of + * data + */ +static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), + pfrag, sk->sk_allocation))) + return true; + + sk->sk_prot->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; +} + +static struct mptcp_data_frag * +mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, + int orig_offset) +{ + int offset = ALIGN(orig_offset, sizeof(long)); + struct 
mptcp_data_frag *dfrag; + + dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); + dfrag->data_len = 0; + dfrag->data_seq = msk->write_seq; + dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); + dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->page = pfrag->page; + + return dfrag; } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, long *timeo, int *pmss_now, + struct msghdr *msg, struct mptcp_data_frag *dfrag, + long *timeo, int *pmss_now, int *ps_goal) { - int mss_now, avail_size, size_goal, ret; + int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; + bool dfrag_collapsed, can_collapse = false; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_ext *mpext = NULL; + bool retransmission = !!dfrag; struct sk_buff *skb, *tail; - bool can_collapse = false; struct page_frag *pfrag; + struct page *page; + u64 *write_seq; size_t psize; /* use the mptcp page cache so that we can easily move the data * from one substream to another, but do per subflow memory accounting + * Note: pfrag is used only when !retransmission, but the compiler is + * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while (!sk_page_frag_refill(ssk, pfrag) || + while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || !mptcp_ext_cache_refill(msk)) { ret = sk_stream_wait_memory(ssk, timeo); if (ret) return ret; + + /* if sk_stream_wait_memory() sleeps snd_una can change + * significantly, refresh the rtx queue + */ + mptcp_clean_una(sk); + if (unlikely(__mptcp_needs_tcp_fallback(msk))) return 0; } + if (!retransmission) { + write_seq = &msk->write_seq; + page = pfrag->page; + } else { + write_seq = &dfrag->data_seq; + page = dfrag->page; + } /* compute copy limit */ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); @@ -329,32 +481,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * SSN association set here */ can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(msk, skb, mpext); + mptcp_skb_can_collapse_to(*write_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else avail_size = size_goal - skb->len; } - psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size); - - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, pfrag->offset, - min_t(size_t, msg_data_left(msg), psize), - &msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; + + if (!retransmission) { + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_rtx_tail(sk); + offset = pfrag->offset; + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + dfrag = mptcp_carve_data_frag(msk, pfrag, offset); + offset = dfrag->offset; + frag_truesize = dfrag->overhead; + } + psize = min_t(size_t, pfrag->size - offset, avail_size); + + /* Copy to page */ + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, offset, + min_t(size_t, msg_data_left(msg), + psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) + return -ENOMEM; + } else { + offset = dfrag->offset; + psize = min_t(size_t, dfrag->data_len, avail_size); + } /* tell the TCP stack to delay the push so that we can safely * access the skb after the sendpages call */ - ret = do_tcp_sendpages(ssk, pfrag->page,
pfrag->offset, psize, + ret = do_tcp_sendpages(ssk, page, offset, psize, msg->msg_flags | MSG_SENDPAGE_NOTLAST); if (ret <= 0) return ret; - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); + + frag_truesize += ret; + if (!retransmission) { + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); + + /* send successful, keep track of sent data for mptcp-level + * retransmission + */ + dfrag->data_len += ret; + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + sk_wmem_queued_add(sk, frag_truesize); + } else { + sk_wmem_queued_add(sk, ret); + } + + /* charge data on mptcp rtx queue to the master socket + * Note: we charge such data both to sk and ssk + */ + sk->sk_forward_alloc -= frag_truesize; + } /* if the tail skb extension is still the cached one, collapsing * really happened. Note: we can't check for 'same skb' as the sk_buff @@ -373,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, msk->cached_ext = NULL; memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = msk->write_seq; + mpext->data_seq = *write_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -384,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->dsn64); out: - pfrag->offset += ret; - msk->write_seq += ret; + if (!retransmission) + pfrag->offset += frag_truesize; + *write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *backup = NULL; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!sk_stream_memory_free(ssk)) { + struct socket *sock = ssk->sk_socket; + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); + } + + return NULL; + } + + if (subflow->backup) { + if (!backup) + backup = ssk; + + continue; + } + + return ssk; + } + + return backup; +} + static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) { struct socket *sock; @@ -438,17 +670,29 @@ fallback: return ret >= 0 ? ret + copied : (copied ? 
copied : ret); } - ssk = mptcp_subflow_get(msk); - if (!ssk) { - release_sock(sk); - return -ENOTCONN; + mptcp_clean_una(sk); + + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk); + while (!sk_stream_memory_free(sk) || !ssk) { + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; + + mptcp_clean_una(sk); + + ssk = mptcp_subflow_get_send(msk); + if (list_empty(&msk->conn_list)) { + ret = -ENOTCONN; + goto out; + } } pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); while (msg_data_left(msg)) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); if (ret < 0) break; @@ -461,10 +705,15 @@ fallback: copied += ret; } + mptcp_set_timeout(sk, ssk); if (copied) { ret = copied; tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); + + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); } ssk_check_wmem(msk, ssk); @@ -572,6 +821,7 @@ fallback: len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + __mptcp_flush_join_list(msk); while (len > (size_t)copied) { int bytes_read; @@ -651,6 +901,69 @@ out_err: return copied; } +static void mptcp_retransmit_handler(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (atomic64_read(&msk->snd_una) == msk->write_seq) { + mptcp_stop_timer(sk); + } else { + set_bit(MPTCP_WORK_RTX, &msk->flags); + if (schedule_work(&msk->work)) + sock_hold(sk); + } +} + +static void mptcp_retransmit_timer(struct timer_list *t) +{ + struct inet_connection_sock *icsk = from_timer(icsk, t, + icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + mptcp_retransmit_handler(sk); + } else { + /* delegate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, + &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + sock_put(sk); +} + +/* Find an idle subflow. Return NULL if there is unacked data at tcp + * level. + * + * A backup subflow is returned only if that is the only kind available. + */ +static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *backup = NULL; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* still data outstanding at TCP level? Don't retransmit. */ + if (!tcp_write_queue_empty(ssk)) + return NULL; + + if (subflow->backup) { + if (!backup) + backup = ssk; + continue; + } + + return ssk; + } + + return backup; +} + /* subflow sockets can be either outgoing (connect) or incoming * (accept). 
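/*
 * How the MPTCP-level retransmission machinery above fits together:
 * outgoing data is mirrored on msk->rtx_queue as mptcp_data_frag
 * descriptors, and mptcp_carve_data_frag() stores each descriptor
 * inside the data page itself, at a long-aligned offset, followed by
 * the payload:
 *
 *   page: | pad | struct mptcp_data_frag | payload ...
 *         ^orig_offset                   ^dfrag->offset
 *
 * dfrag->overhead covers the pad plus the descriptor and is charged
 * together with the payload. The MPTCP-level timer reuses the master
 * socket's icsk_retransmit_timer (see __mptcp_init_sock() below); on
 * expiry the worker replays the head dfrag through mptcp_sendmsg_frag()
 * with a non-NULL dfrag on an idle subflow chosen by
 * mptcp_subflow_get_retrans(), and mptcp_clean_una() releases frags
 * once snd_una moves past them.
 */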
* @@ -684,10 +997,63 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); - struct sock *sk = &msk->sk.icsk_inet.sk; + struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; + int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0; + struct mptcp_data_frag *dfrag; + u64 orig_write_seq; + size_t copied = 0; + struct msghdr msg; + long timeo = 0; lock_sock(sk); + mptcp_clean_una(sk); + __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) + goto unlock; + + dfrag = mptcp_rtx_head(sk); + if (!dfrag) + goto unlock; + + ssk = mptcp_subflow_get_retrans(msk); + if (!ssk) + goto reset_unlock; + + lock_sock(ssk); + + msg.msg_flags = MSG_DONTWAIT; + orig_len = dfrag->data_len; + orig_offset = dfrag->offset; + orig_write_seq = dfrag->data_seq; + while (dfrag->data_len > 0) { + ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now, + &size_goal); + if (ret < 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + dfrag->data_len -= ret; + dfrag->offset += ret; + } + if (copied) + tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, + size_goal); + + dfrag->data_seq = orig_write_seq; + dfrag->offset = orig_offset; + dfrag->data_len = orig_len; + + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + +reset_unlock: + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + +unlock: release_sock(sk); sock_put(sk); } @@ -696,22 +1062,55 @@ static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + spin_lock_init(&msk->join_list_lock); + INIT_LIST_HEAD(&msk->conn_list); + INIT_LIST_HEAD(&msk->join_list); + INIT_LIST_HEAD(&msk->rtx_queue); __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + mptcp_pm_data_init(msk); + + /* re-use the csk retrans timer for MPTCP-level retrans */ + timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); + return 0; } static int mptcp_init_sock(struct sock *sk) { - if (!mptcp_is_enabled(sock_net(sk))) + struct net *net = sock_net(sk); + int ret; + + if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; - return __mptcp_init_sock(sk); + if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) + return -ENOMEM; + + ret = __mptcp_init_sock(sk); + if (ret) + return ret; + + sk_sockets_allocated_inc(sk); + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; + + return 0; +} + +static void __mptcp_clear_xmit(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *dtmp, *dfrag; + + sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + + list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) + dfrag_clear(sk, dfrag); } static void mptcp_cancel_work(struct sock *sk) @@ -767,10 +1166,14 @@ static void mptcp_close(struct sock *sk, long timeout) mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_flush_join_list(msk); + list_splice_init(&msk->conn_list, &conn_list); data_fin_tx_seq = msk->write_seq; + __mptcp_clear_xmit(sk); + release_sock(sk); list_for_each_entry_safe(subflow, tmp, &conn_list, node) { @@ -782,6 +1185,7 @@ static void mptcp_close(struct sock *sk, long timeout) } mptcp_cancel_work(sk); + mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); @@ -811,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct 
sock *ssk) inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } +static int mptcp_disconnect(struct sock *sk, int flags) +{ + lock_sock(sk); + __mptcp_clear_xmit(sk); + release_sock(sk); + mptcp_cancel_work(sk); + return tcp_disconnect(sk, flags); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { @@ -854,6 +1267,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) } msk->write_seq = subflow_req->idsn + 1; + atomic64_set(&msk->snd_una, msk->write_seq); if (subflow_req->remote_key_valid) { msk->can_ack = true; msk->remote_key = subflow_req->remote_key; @@ -861,6 +1275,9 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) ack_seq++; msk->ack_seq = ack_seq; } + + /* will be fully established after successful MPC subflow creation */ + inet_sk_state_store(nsk, TCP_SYN_RECV); bh_unlock_sock(nsk); /* keep a single reference */ @@ -916,12 +1333,13 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, mptcp_copy_inaddrs(newsk, ssk); list_add(&subflow->node, &msk->conn_list); - /* will be fully established at mptcp_stream_accept() - * completion. - */ - inet_sk_state_store(new_mptcp_sock, TCP_SYN_RECV); bh_unlock_sock(new_mptcp_sock); + + __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); local_bh_enable(); + } else { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } return newsk; @@ -933,6 +1351,8 @@ static void mptcp_destroy(struct sock *sk) if (msk->cached_ext) __skb_ext_put(msk->cached_ext); + + sk_sockets_allocated_dec(sk); } static int mptcp_setsockopt(struct sock *sk, int level, int optname, @@ -985,7 +1405,8 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; } -#define MPTCP_DEFERRED_ALL TCPF_DELACK_TIMER_DEFERRED +#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED) /* this is very alike tcp_release_cb() but we must handle differently a * different set of events @@ -1001,6 +1422,8 @@ static void mptcp_release_cb(struct sock *sk) nflags = flags & ~MPTCP_DEFERRED_ALL; } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); + sock_release_ownership(sk); + if (flags & TCPF_DELACK_TIMER_DEFERRED) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; @@ -1009,6 +1432,11 @@ static void mptcp_release_cb(struct sock *sk) if (!ssk || !schedule_work(&msk->work)) __sock_put(sk); } + + if (flags & TCPF_WRITE_TIMER_DEFERRED) { + mptcp_retransmit_handler(sk); + __sock_put(sk); + } } static int mptcp_get_port(struct sock *sk, unsigned short snum) @@ -1032,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk) u64 ack_seq; subflow = mptcp_subflow_ctx(ssk); - - if (!subflow->mp_capable) - return; - sk = subflow->conn; msk = mptcp_sk(sk); + if (!subflow->mp_capable) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + return; + } + pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); @@ -1056,10 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); - if (inet_sk_state_load(sk) != TCP_ESTABLISHED) { - inet_sk_state_store(sk, TCP_ESTABLISHED); - sk->sk_state_change(sk); - } + atomic64_set(&msk->snd_una, msk->write_seq); + + mptcp_pm_new_connection(msk, 0); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -1071,6 +1500,46 @@ static void 
mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } +bool mptcp_finish_join(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sock *parent = (void *)msk; + struct socket *parent_sock; + bool ret; + + pr_debug("msk=%p, subflow=%p", msk, subflow); + + /* mptcp socket already closing? */ + if (inet_sk_state_load(parent) != TCP_ESTABLISHED) + return false; + + if (!msk->pm.server_side) + return true; + + /* passive connection, attach to msk socket */ + parent_sock = READ_ONCE(parent->sk_socket); + if (parent_sock && !sk->sk_socket) + mptcp_sock_graft(sk, parent_sock); + + ret = mptcp_pm_allow_new_subflow(msk); + if (ret) { + /* active connections are already on conn_list */ + spin_lock_bh(&msk->join_list_lock); + if (!WARN_ON_ONCE(!list_empty(&subflow->node))) + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); + } + return ret; +} + +bool mptcp_sk_is_subflow(const struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + return subflow->mp_join == 1; +} + static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1082,6 +1551,7 @@ static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, + .disconnect = mptcp_disconnect, .close = mptcp_close, .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, @@ -1094,7 +1564,12 @@ static struct proto mptcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = mptcp_get_port, + .sockets_allocated = &mptcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, .stream_memory_free = mptcp_memory_free, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .no_autobind = true, }; @@ -1250,14 +1725,13 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. 
*/ + __mptcp_flush_join_list(msk); list_for_each_entry(subflow, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } - - inet_sk_state_store(newsock->sk, TCP_ESTABLISHED); } sock_put(ssock->sk); @@ -1333,6 +1807,7 @@ static int mptcp_shutdown(struct socket *sock, int how) sock->state = SS_CONNECTED; } + __mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -1383,7 +1858,11 @@ void mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) + panic("Failed to allocate MPTCP pcpu counter\n"); + mptcp_subflow_init(); + mptcp_pm_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9baf6fcba914..f733c5425552 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -17,6 +17,12 @@ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) +#define OPTION_MPTCP_MPJ_SYN BIT(3) +#define OPTION_MPTCP_MPJ_SYNACK BIT(4) +#define OPTION_MPTCP_MPJ_ACK BIT(5) +#define OPTION_MPTCP_ADD_ADDR BIT(6) +#define OPTION_MPTCP_ADD_ADDR6 BIT(7) +#define OPTION_MPTCP_RM_ADDR BIT(8) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -33,12 +39,30 @@ #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 +#define TCPOLEN_MPTCP_MPJ_SYN 12 +#define TCPOLEN_MPTCP_MPJ_SYNACK 16 +#define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 +#define TCPOLEN_MPTCP_ADD_ADDR 16 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR6 28 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 +#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 + +/* MPTCP MP_JOIN flags */ +#define MPTCPOPT_BACKUP BIT(0) +#define MPTCPOPT_HMAC_LEN 20 +#define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) @@ -55,9 +79,75 @@ #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) +/* MPTCP ADD_ADDR flags */ +#define MPTCP_ADDR_ECHO BIT(0) +#define MPTCP_ADDR_HMAC_LEN 20 +#define MPTCP_ADDR_IPVERSION_4 4 +#define MPTCP_ADDR_IPVERSION_6 6 + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 +#define MPTCP_WORK_RTX 2 + +static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) +{ + return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | + ((nib & 0xF) << 8) | field); +} + +#define MPTCP_PM_MAX_ADDR 4 + +struct mptcp_addr_info { + sa_family_t family; + __be16 port; + u8 id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; +}; + +enum mptcp_pm_status { + MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ESTABLISHED, + MPTCP_PM_SUBFLOW_ESTABLISHED, +}; + +struct mptcp_pm_data { + struct mptcp_addr_info local; + struct mptcp_addr_info remote; + + spinlock_t lock; /*protects the whole PM data */ + + bool addr_signal; + bool server_side; + bool work_pending; + bool accept_addr; + bool accept_subflow; 
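/*
 * Field semantics, as far as the surrounding code shows: the *_max
 * members below snapshot the limits in force for this connection (the
 * netlink PM presumably copies them from the per-netns values), while
 * the unsuffixed counters track how much of each budget has been
 * consumed so far. u8 is sufficient because the netlink handlers cap
 * the configurable limits at MPTCP_PM_ADDR_MAX.
 */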
+ u8 add_addr_signaled; + u8 add_addr_accepted; + u8 local_addr_used; + u8 subflows; + u8 add_addr_signal_max; + u8 add_addr_accept_max; + u8 local_addr_max; + u8 subflows_max; + u8 status; + + struct work_struct work; +}; + +struct mptcp_data_frag { + struct list_head list; + u64 data_seq; + int data_len; + int offset; + int overhead; + struct page *page; +}; /* MPTCP connection sock */ struct mptcp_sock { @@ -67,14 +157,20 @@ struct mptcp_sock { u64 remote_key; u64 write_seq; u64 ack_seq; + atomic64_t snd_una; + unsigned long timer_ival; u32 token; unsigned long flags; bool can_ack; + spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; + struct list_head rtx_queue; + struct list_head join_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; + struct mptcp_pm_data pm; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -85,17 +181,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (list_empty(&msk->rtx_queue)) + return NULL; + + return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + +static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (list_empty(&msk->rtx_queue)) + return NULL; + + return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, remote_key_valid : 1; + u8 local_id; + u8 remote_id; u64 local_key; u64 remote_key; u64 idsn; u32 token; u32 ssn_offset; + u64 thmac; + u32 local_nonce; + u32 remote_nonce; }; static inline struct mptcp_subflow_request_sock * @@ -118,16 +239,28 @@ struct mptcp_subflow_context { u32 ssn_offset; u32 map_data_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ + request_join : 1, /* send MP_JOIN */ + request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ - fourth_ack : 1, /* send initial DSS */ + mp_join : 1, /* remote is JOINing */ + fully_established : 1, /* path validated */ + pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, mpc_map : 1, + backup : 1, data_avail : 1, rx_eof : 1, data_fin_tx_enable : 1, can_ack : 1; /* only after processing the remote a key */ u64 data_fin_tx_seq; + u32 remote_nonce; + u64 thmac; + u32 local_nonce; + u32 remote_token; + u8 hmac[MPTCPOPT_HMAC_LEN]; + u8 local_id; + u8 remote_id; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ @@ -171,6 +304,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) int mptcp_is_enabled(struct net *net); bool mptcp_subflow_data_available(struct sock *sk); void mptcp_subflow_init(void); + +/* called with sk socket lock held */ +int __mptcp_subflow_connect(struct sock *sk, int ifindex, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, @@ -199,11 +337,14 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); +bool mptcp_finish_join(struct sock *sk); +void mptcp_data_acked(struct sock *sk); 
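All of the 64-bit sequence bookkeeping declared above (write_seq, snd_una, the dfrag sequence numbers) is compared with the wrap-safe before64()/after64() helpers defined at the bottom of this header. A minimal standalone sketch of that idiom, assuming the helper reduces to a signed subtraction as is conventional for TCP-style sequence arithmetic (demo_before64 is a hypothetical stand-in):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool demo_before64(uint64_t seq1, uint64_t seq2)
{
	/* a negative signed distance means seq1 precedes seq2,
	 * even when the 64-bit counter has wrapped
	 */
	return (int64_t)(seq1 - seq2) < 0;
}

int main(void)
{
	assert(demo_before64(1, 2));
	assert(!demo_before64(2, 1));
	assert(demo_before64(UINT64_MAX, 0));	/* wrap case */
	return 0;
}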
int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(u32 token); int mptcp_token_new_connect(struct sock *sk); int mptcp_token_new_accept(u32 token, struct sock *conn); +struct mptcp_sock *mptcp_token_get_sock(u32 token); void mptcp_token_destroy(u32 token); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); @@ -219,8 +360,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) mptcp_crypto_key_sha(*key, token, idsn); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hash_out); +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); + +void mptcp_pm_init(void); +void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_close(struct mptcp_sock *msk); +void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); +void mptcp_pm_fully_established(struct mptcp_sock *msk); +bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); +void mptcp_pm_connection_closed(struct mptcp_sock *msk); +void mptcp_pm_subflow_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow); +void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); +void mptcp_pm_add_addr_received(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); + +int mptcp_pm_announce_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); +int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id); + +static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal); +} + +static inline unsigned int mptcp_add_addr_len(int family) +{ + if (family == AF_INET) + return TCPOLEN_MPTCP_ADD_ADDR; + return TCPOLEN_MPTCP_ADD_ADDR6; +} + +bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, + struct mptcp_addr_info *saddr); +int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); + +void mptcp_pm_nl_init(void); +void mptcp_pm_nl_data_init(struct mptcp_sock *msk); +void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); +void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); +void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { @@ -234,4 +415,6 @@ static inline bool before64(__u64 seq1, __u64 seq2) #define after64(seq2, seq1) before64(seq1, seq2) +void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8434c7f5f712..b5180c81588e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> +#include <crypto/algapi.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> @@ -19,17 +20,42 @@ #endif #include <net/mptcp.h> #include "protocol.h" +#include "mib.h" + +static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, + enum linux_mptcp_mib_field field) +{ + MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); +} static int subflow_rebuild_header(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int err = 0; + int local_id, err = 0; if (subflow->request_mptcp && !subflow->token) { pr_debug("subflow=%p", sk); err = mptcp_token_new_connect(sk); + } else if (subflow->request_join && !subflow->local_nonce) { + struct 
mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; + + pr_debug("subflow=%p", sk); + + do { + get_random_bytes(&subflow->local_nonce, sizeof(u32)); + } while (!subflow->local_nonce); + + if (subflow->local_id) + goto out; + + local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); + if (local_id < 0) + return -EINVAL; + + subflow->local_id = local_id; } +out: if (err) return err; @@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req) tcp_request_sock_ops.destructor(req); } +static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hmac) +{ + u8 msg[8]; + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); +} + +/* validate received token and create truncated hmac and nonce for SYN-ACK */ +static bool subflow_token_join_request(struct request_sock *req, + const struct sk_buff *skb) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + u8 hmac[MPTCPOPT_HMAC_LEN]; + struct mptcp_sock *msk; + int local_id; + + msk = mptcp_token_get_sock(subflow_req->token); + if (!msk) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); + return false; + } + + local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); + if (local_id < 0) { + sock_put((struct sock *)msk); + return false; + } + subflow_req->local_id = local_id; + + get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); + + subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_req->local_nonce, + subflow_req->remote_nonce, hmac); + + subflow_req->thmac = get_unaligned_be64(hmac); + + sock_put((struct sock *)msk); + return true; +} + static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req, mptcp_get_options(skb, &rx_opt); subflow_req->mp_capable = 0; + subflow_req->mp_join = 0; subflow_req->remote_key_valid = 0; #ifdef CONFIG_TCP_MD5SIG @@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req, return; #endif + if (rx_opt.mptcp.mp_capable) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + + if (rx_opt.mptcp.mp_join) + return; + } else if (rx_opt.mptcp.mp_join) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + } + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { int err; @@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) { + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + subflow_req->mp_join = 1; + subflow_req->backup = rx_opt.mptcp.backup; + subflow_req->remote_id = rx_opt.mptcp.join_id; + subflow_req->token = rx_opt.mptcp.token; + subflow_req->remote_nonce = rx_opt.mptcp.nonce; + pr_debug("token=%u, remote_nonce=%u", subflow_req->token, + subflow_req->remote_nonce); + if (!subflow_token_join_request(req, skb)) { + subflow_req->mp_join = 0; + // @@ need to trigger RST + } } } @@ -106,13 +200,41 @@ static void subflow_v6_init_req(struct request_sock *req, } #endif +/* validate received truncated hmac and create hmac for third ACK */ +static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) +{ + u8 hmac[MPTCPOPT_HMAC_LEN]; + u64 thmac; + + subflow_generate_hmac(subflow->remote_key, subflow->local_key, + subflow->remote_nonce, subflow->local_nonce, + hmac); + + thmac = get_unaligned_be64(hmac); + pr_debug("subflow=%p, 
token=%u, thmac=%llu, subflow->thmac=%llu\n", + subflow, subflow->token, + (unsigned long long)thmac, + (unsigned long long)subflow->thmac); + + return thmac == subflow->thmac; +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct sock *parent = subflow->conn; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - if (!subflow->conn_finished) { + if (inet_sk_state_load(parent) != TCP_ESTABLISHED) { + inet_sk_state_store(parent, TCP_ESTABLISHED); + parent->sk_state_change(parent); + } + + if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp) + return; + + if (subflow->mp_capable) { pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), subflow->remote_key); mptcp_finish_connect(sk); @@ -122,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); subflow->ssn_offset = TCP_SKB_CB(skb)->seq; } + } else if (subflow->mp_join) { + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", + subflow, subflow->thmac, + subflow->remote_nonce); + if (!subflow_thmac_valid(subflow)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->mp_join = 0; + goto do_reset; + } + + subflow_generate_hmac(subflow->local_key, subflow->remote_key, + subflow->local_nonce, + subflow->remote_nonce, + subflow->hmac); + + if (skb) + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + + if (!mptcp_finish_join(sk)) + goto do_reset; + + subflow->conn_finished = 1; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + } else { +do_reset: + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); } } @@ -172,6 +321,32 @@ drop: } #endif +/* validate hmac received in third ACK */ +static bool subflow_hmac_valid(const struct request_sock *req, + const struct tcp_options_received *rx_opt) +{ + const struct mptcp_subflow_request_sock *subflow_req; + u8 hmac[MPTCPOPT_HMAC_LEN]; + struct mptcp_sock *msk; + bool ret; + + subflow_req = mptcp_subflow_rsk(req); + msk = mptcp_token_get_sock(subflow_req->token); + if (!msk) + return false; + + subflow_generate_hmac(msk->remote_key, msk->local_key, + subflow_req->remote_nonce, + subflow_req->local_nonce, hmac); + + ret = true; + if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac))) + ret = false; + + sock_put((struct sock *)msk); + return ret; +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -182,6 +357,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct tcp_options_received opt_rx; + bool fallback_is_fatal = false; struct sock *new_msk = NULL; struct sock *child; @@ -215,6 +391,15 @@ create_msk: new_msk = mptcp_sk_clone(listener->conn, req); if (!new_msk) subflow_req->mp_capable = 0; + } else if (subflow_req->mp_join) { + fallback_is_fatal = true; + opt_rx.mptcp.mp_join = 0; + mptcp_get_options(skb, &opt_rx); + if (!opt_rx.mptcp.mp_join || + !subflow_hmac_valid(req, &opt_rx)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + return NULL; + } } create_child: @@ -224,18 +409,35 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); - /* we have null ctx on TCP fallback, not fatal on MPC - * handshake + /* we have null ctx on TCP fallback, which is fatal on + * MPJ handshake */ - if (!ctx) + if (!ctx) { + if (fallback_is_fatal) + goto 
close_child; goto out; + } if (ctx->mp_capable) { /* new mpc subflow takes ownership of the newly * created mptcp socket */ + inet_sk_state_store(new_msk, TCP_ESTABLISHED); + mptcp_pm_new_connection(mptcp_sk(new_msk), 1); ctx->conn = new_msk; new_msk = NULL; + } else if (ctx->mp_join) { + struct mptcp_sock *owner; + + owner = mptcp_token_get_sock(ctx->token); + if (!owner) + goto close_child; + + ctx->conn = (struct sock *)owner; + if (!mptcp_finish_join(child)) + goto close_child; + + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); } } @@ -244,6 +446,12 @@ out: if (unlikely(new_msk)) sock_put(new_msk); return child; + +close_child: + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; } static struct inet_connection_sock_af_ops subflow_specific; @@ -345,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk) data_len = mpext->data_len; if (data_len == 0) { pr_err("Infinite mapping not handled"); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); return MAPPING_INVALID; } @@ -388,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk) /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ - if (skb_is_fully_mapped(ssk, skb)) + if (skb_is_fully_mapped(ssk, skb)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; + } /* will validate the next map after consuming the current one */ return MAPPING_OK; @@ -558,7 +769,7 @@ static void subflow_data_ready(struct sock *sk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - if (!subflow->mp_capable) { + if (!subflow->mp_capable && !subflow->mp_join) { subflow->tcp_data_ready(sk); parent->sk_data_ready(parent); @@ -613,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) } #endif +static void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr) +{ + memset(addr, 0, sizeof(*addr)); + addr->ss_family = info->family; + if (addr->ss_family == AF_INET) { + struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; + + in_addr->sin_addr = info->addr; + in_addr->sin_port = info->port; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->ss_family == AF_INET6) { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; + + in6_addr->sin6_addr = info->addr6; + in6_addr->sin6_port = info->port; + } +#endif +} + +int __mptcp_subflow_connect(struct sock *sk, int ifindex, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *remote) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + struct sockaddr_storage addr; + struct socket *sf; + u32 remote_token; + int addrlen; + int err; + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + err = mptcp_subflow_create_socket(sk, &sf); + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + subflow->remote_key = msk->remote_key; + subflow->local_key = msk->local_key; + subflow->token = msk->token; + mptcp_info2sockaddr(loc, &addr); + + addrlen = sizeof(struct sockaddr_in); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (loc->family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + sf->sk->sk_bound_dev_if = ifindex; + err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); + if (err) + goto failed; + + mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); + pr_debug("msk=%p remote_token=%u", msk, remote_token); + 
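/*
 * For reference, the MP_JOIN handshake that this connect path
 * initiates, matching the validation helpers in this file:
 *
 *   SYN      ->  token (derived from the peer's key), local nonce
 *   SYN/ACK  <-  truncated 64-bit HMAC (thmac) over the nonces, peer nonce
 *   ACK      ->  full MPTCPOPT_HMAC_LEN-byte HMAC
 *
 * subflow_thmac_valid() checks the SYN/ACK on the initiator and
 * subflow_hmac_valid() checks the final ACK on the listener; a failed
 * check resets the subflow. Newly joined subflows are parked on
 * msk->join_list under join_list_lock and later spliced into conn_list
 * by __mptcp_flush_join_list().
 */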
subflow->remote_token = remote_token; + subflow->local_id = loc->id; + subflow->request_join = 1; + subflow->request_bkup = 1; + mptcp_info2sockaddr(remote, &addr); + + err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); + if (err && err != -EINPROGRESS) + goto failed; + + spin_lock_bh(&msk->join_list_lock); + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); + + return err; + +failed: + sock_release(sf); + return err; +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -777,7 +1067,8 @@ static void subflow_ulp_clone(const struct request_sock *req, struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; - if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) { + if (!tcp_rsk(req)->is_mptcp || + (!subflow_req->mp_capable && !subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } @@ -788,9 +1079,6 @@ static void subflow_ulp_clone(const struct request_sock *req, return; } - /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully - * established only after we receive the remote key - */ new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; @@ -799,14 +1087,27 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; - new_ctx->mp_capable = 1; - new_ctx->fourth_ack = subflow_req->remote_key_valid; - new_ctx->can_ack = subflow_req->remote_key_valid; - new_ctx->remote_key = subflow_req->remote_key; - new_ctx->local_key = subflow_req->local_key; - new_ctx->token = subflow_req->token; - new_ctx->ssn_offset = subflow_req->ssn_offset; - new_ctx->idsn = subflow_req->idsn; + if (subflow_req->mp_capable) { + /* see comments in subflow_syn_recv_sock(), MPTCP connection + * is fully established only after we receive the remote key + */ + new_ctx->mp_capable = 1; + new_ctx->fully_established = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; + } else if (subflow_req->mp_join) { + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->mp_join = 1; + new_ctx->fully_established = 1; + new_ctx->backup = subflow_req->backup; + new_ctx->local_id = subflow_req->local_id; + new_ctx->token = subflow_req->token; + new_ctx->thmac = subflow_req->thmac; + } } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { @@ -868,6 +1169,8 @@ void mptcp_subflow_init(void) subflow_v6m_specific.net_frag_header_len = 0; #endif + mptcp_diag_subflow_init(&subflow_ulp_ops); + if (tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } diff --git a/net/mptcp/token.c b/net/mptcp/token.c index b71b53c0ac8d..129a5ad1bc35 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -141,6 +141,33 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) } /** + * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @token: token of the mptcp connection to retrieve + * + * This function returns the mptcp connection structure with the given token. + * A reference count on the mptcp socket returned is taken. + * + * returns NULL if no connection with the given token value exists. 
+ */ +struct mptcp_sock *mptcp_token_get_sock(u32 token) +{ + struct sock *conn; + + spin_lock_bh(&token_tree_lock); + conn = radix_tree_lookup(&token_tree, token); + if (conn) { + /* token still reserved? */ + if (conn == (struct sock *)&token_used) + conn = NULL; + else + sock_hold(conn); + } + spin_unlock_bh(&token_tree_lock); + + return mptcp_sk(conn); +} + +/** * mptcp_token_destroy_request - remove mptcp connection/token * @token - token of mptcp connection to remove * diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 91efae88e8c2..468fea1aebba 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -455,14 +455,6 @@ config NF_TABLES To compile it as a module, choose M here. if NF_TABLES - -config NF_TABLES_SET - tristate "Netfilter nf_tables set infrastructure" - help - This option enables the nf_tables set infrastructure that allows to - look up for elements in a set and to build one-way mappings between - matchings and actions. - config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3f572e5a975e..292e71dc7ba4 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,14 +78,17 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o \ - nft_chain_route.o nf_tables_offload.o + nft_chain_route.o nf_tables_offload.o \ + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ + nft_set_pipapo.o -nf_tables_set-objs := nf_tables_set_core.o \ - nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ - nft_set_pipapo.o +ifdef CONFIG_X86_64 +ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS))) +nf_tables-objs += nft_set_pipapo_avx2.o +endif +endif obj-$(CONFIG_NF_TABLES) += nf_tables.o -obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 0a2196f59106..486959f70cf3 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -46,7 +46,7 @@ struct bitmap_ip { u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* data extensions */ + unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 739e343efaf6..2310a316e0af 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -49,7 +49,7 @@ struct bitmap_ipmac { size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ struct ip_set *set; /* attached to this ip_set */ - unsigned char extensions[0] /* MAC + data extensions */ + unsigned char extensions[] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index b49978dd810d..e56ced66f202 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -37,7 +37,7 @@ struct bitmap_port { size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set 
*/ - unsigned char extensions[0] /* data extensions */ + unsigned char extensions[] /* data extensions */ __aligned(__alignof__(u64)); }; diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index e52d7b7597a0..1ee43752d6d3 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -76,7 +76,7 @@ struct hbucket { DECLARE_BITMAP(used, AHASH_MAX_TUNED); u8 size; /* size of the array */ u8 pos; /* position of the first free entry */ - unsigned char value[0] /* the array of the values */ + unsigned char value[] /* the array of the values */ __aligned(__alignof__(u64)); }; @@ -109,7 +109,7 @@ struct htable { u8 htable_bits; /* size of hash table == 2^htable_bits */ u32 maxelem; /* Maxelem per region */ struct ip_set_region *hregion; /* Region locks and ext sizes */ - struct hbucket __rcu *bucket[0]; /* hashtable buckets */ + struct hbucket __rcu *bucket[]; /* hashtable buckets */ }; #define hbucket(h, i) ((h)->bucket[i]) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 512259f579d7..aa6a603a2425 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1661,8 +1661,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, offset2, ihl, verdict; - bool ipip, new_cp = false; + bool tunnel, new_cp = false; union nf_inet_addr *raddr; + char *outer_proto = "IPIP"; *related = 1; @@ -1703,8 +1704,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, return NF_ACCEPT; /* The packet looks wrong, ignore */ raddr = (union nf_inet_addr *)&cih->daddr; - /* Special case for errors for IPIP packets */ - ipip = false; + /* Special case for errors for IPIP/UDP/GRE tunnel packets */ + tunnel = false; if (cih->protocol == IPPROTO_IPIP) { struct ip_vs_dest *dest; @@ -1721,7 +1722,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - ipip = true; + tunnel = true; } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ /* Error for our tunnel must arrive at LOCAL_IN */ @@ -1729,16 +1730,19 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, __u8 iproto; int ulen; - /* Non-first fragment has no UDP header */ + /* Non-first fragment has no UDP/GRE header */ if (unlikely(cih->frag_off & htons(IP_OFFSET))) return NF_ACCEPT; offset2 = offset + cih->ihl * 4; - if (cih->protocol == IPPROTO_UDP) + if (cih->protocol == IPPROTO_UDP) { ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); - else + outer_proto = "UDP"; + } else { ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, raddr, &iproto); + outer_proto = "GRE"; + } if (ulen > 0) { /* Skip IP and UDP/GRE tunnel headers */ offset = offset2 + ulen; @@ -1747,7 +1751,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, &_ciph); if (cih && cih->version == 4 && cih->ihl >= 5 && iproto == IPPROTO_IPIP) - ipip = true; + tunnel = true; else return NF_ACCEPT; } @@ -1767,11 +1771,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, "Checking incoming ICMP for"); offset2 = offset; - ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph); + ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); 
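/*
 * Summary of the ip_vs change in progress here: the ICMP error handling
 * used to special-case only IPIP-encapsulated packets; it now also
 * recognizes errors for UDP- and GRE-encapsulated tunnels, hence the
 * rename of the "ipip" flag to "tunnel" and the outer_proto string
 * ("IPIP", "UDP" or "GRE"), which only feeds the debug messages.
 */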
offset = ciph.len; /* The embedded headers contain source and dest in reverse order. - * For IPIP this is error for request, not for reply. + * For IPIP/UDP/GRE tunnel this is error for request, not for reply. */ cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, ipvs, AF_INET, skb, &ciph); @@ -1779,7 +1783,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, if (!cp) { int v; - if (ipip || !sysctl_schedule_icmp(ipvs)) + if (tunnel || !sysctl_schedule_icmp(ipvs)) return NF_ACCEPT; if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) @@ -1797,7 +1801,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, goto out; } - if (ipip) { + if (tunnel) { __be32 info = ic->un.gateway; __u8 type = ic->type; __u8 code = ic->code; @@ -1809,17 +1813,18 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, u32 mtu = ntohs(ic->un.frag.mtu); __be16 frag_off = cih->frag_off; - /* Strip outer IP and ICMP, go to IPIP header */ + /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) - goto ignore_ipip; + goto ignore_tunnel; offset2 -= ihl + sizeof(_icmph); skb_reset_network_header(skb); - IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n", - &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu); + IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", + outer_proto, &ip_hdr(skb)->saddr, + &ip_hdr(skb)->daddr, mtu); ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); /* Client uses PMTUD? */ if (!(frag_off & htons(IP_DF))) - goto ignore_ipip; + goto ignore_tunnel; /* Prefer the resulting PMTU */ if (dest) { struct ip_vs_dest_dst *dest_dst; @@ -1832,11 +1837,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, mtu -= sizeof(struct iphdr); info = htonl(mtu); } - /* Strip outer IP, ICMP and IPIP, go to IP header of + /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of * original request. */ if (pskb_pull(skb, offset2) == NULL) - goto ignore_ipip; + goto ignore_tunnel; skb_reset_network_header(skb); IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, @@ -1845,7 +1850,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, /* ICMP can be shorter but anyways, account it */ ip_vs_out_stats(cp, skb); -ignore_ipip: +ignore_tunnel: consume_skb(skb); verdict = NF_STOLEN; goto out; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1927fc296f95..c4582eb71766 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -143,6 +143,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, } static void nf_conntrack_all_lock(void) + __acquires(&nf_conntrack_locks_all_lock) { int i; @@ -162,6 +163,7 @@ static void nf_conntrack_all_lock(void) } static void nf_conntrack_all_unlock(void) + __releases(&nf_conntrack_locks_all_lock) { /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. 
Otherwise nf_conntrack_lock() @@ -863,9 +865,8 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); -static inline void nf_ct_acct_update(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int len) +void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, + unsigned int bytes) { struct nf_conn_acct *acct; @@ -873,10 +874,11 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, if (acct) { struct nf_conn_counter *counter = acct->counter; - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + atomic64_add(packets, &counter[dir].packets); + atomic64_add(bytes, &counter[dir].bytes); } } +EXPORT_SYMBOL_GPL(nf_ct_acct_add); static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn *loser_ct) @@ -890,7 +892,7 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, /* u32 should be fine since we must have seen one packet. */ bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); - nf_ct_acct_update(ct, ctinfo, bytes); + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } } @@ -1931,7 +1933,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, WRITE_ONCE(ct->timeout, extra_jiffies); acct: if (do_acct) - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1939,7 +1941,7 @@ bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb) { - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); return nf_ct_delete(ct, 0, 0); } @@ -2633,7 +2635,6 @@ void nf_conntrack_init_end(void) */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) #define DYING_NULLS_VAL ((1<<30)+1) -#define TEMPLATE_NULLS_VAL ((1<<30)+2) int nf_conntrack_init_net(struct net *net) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6a1c8f1f6171..9ddfcd002d3b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -860,7 +860,7 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) struct ctnetlink_filter *filter; #ifndef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) + if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) return ERR_PTR(-EOPNOTSUPP); #endif @@ -1533,6 +1533,7 @@ static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) + __must_hold(RCU) { struct nf_nat_hook *nat_hook; int err; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4912069627b6..9b57330c81f8 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1054,21 +1054,18 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); - /* Don't export sysctls to unprivileged users */ + /* Don't allow unprivileged users to alter certain sysctls */ if (net->user_ns != &init_user_ns) { - table[NF_SYSCTL_CT_MAX].procname = NULL; - table[NF_SYSCTL_CT_ACCT].procname = NULL; - table[NF_SYSCTL_CT_HELPER].procname = NULL; -#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP - table[NF_SYSCTL_CT_TIMESTAMP].procname = NULL; -#endif + table[NF_SYSCTL_CT_MAX].mode = 0444; + table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; + table[NF_SYSCTL_CT_HELPER].mode = 0444; #ifdef CONFIG_NF_CONNTRACK_EVENTS - 
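The nf_ct_acct_add() change above generalizes the old per-skb helper so one routine takes (packets, bytes) deltas per direction: the packet path passes (1, skb->len) while batched hardware stats can pass whole sums. A C11 sketch of that shape, with illustrative names only:

/* One accounting helper serves both the per-packet path and batched
 * offload statistics; counters are lock-free atomics per direction. */
#include <stdatomic.h>
#include <stdio.h>

enum { DIR_ORIGINAL, DIR_REPLY, DIR_MAX };

struct counter {
    atomic_ullong packets;
    atomic_ullong bytes;
};

static struct counter acct[DIR_MAX];

static void acct_add(int dir, unsigned long long packets,
                     unsigned long long bytes)
{
    atomic_fetch_add(&acct[dir].packets, packets);
    atomic_fetch_add(&acct[dir].bytes, bytes);
}

int main(void)
{
    acct_add(DIR_ORIGINAL, 1, 1500);    /* per-skb update */
    acct_add(DIR_REPLY, 1000, 64000);   /* batched HW stats delta */
    printf("orig: %llu pkts\n",
           (unsigned long long)atomic_load(&acct[DIR_ORIGINAL].packets));
    return 0;
}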
table[NF_SYSCTL_CT_EVENTS].procname = NULL; + table[NF_SYSCTL_CT_EVENTS].mode = 0444; #endif - } - - if (!net_eq(&init_net, net)) table[NF_SYSCTL_CT_BUCKETS].mode = 0444; + } else if (!net_eq(&init_net, net)) { + table[NF_SYSCTL_CT_BUCKETS].mode = 0444; + } net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.sysctl_header) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9a477bd563b7..c0cb79495c35 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -392,7 +392,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, struct flow_block_cb *block_cb; int err = 0; - mutex_lock(&flow_table->flow_block_lock); + down_write(&flow_table->flow_block_lock); block_cb = flow_block_cb_lookup(block, cb, cb_priv); if (block_cb) { err = -EEXIST; @@ -408,7 +408,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, list_add_tail(&block_cb->list, &block->cb_list); unlock: - mutex_unlock(&flow_table->flow_block_lock); + up_write(&flow_table->flow_block_lock); return err; } EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb); @@ -419,13 +419,13 @@ void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, struct flow_block *block = &flow_table->flow_block; struct flow_block_cb *block_cb; - mutex_lock(&flow_table->flow_block_lock); + down_write(&flow_table->flow_block_lock); block_cb = flow_block_cb_lookup(block, cb, cb_priv); if (block_cb) list_del(&block_cb->list); else WARN_ON(true); - mutex_unlock(&flow_table->flow_block_lock); + up_write(&flow_table->flow_block_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); @@ -551,7 +551,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); - mutex_init(&flowtable->flow_block_lock); + init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); @@ -613,8 +613,10 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); nf_flow_table_offload_flush(flow_table); + if (nf_flowtable_hw_offload(flow_table)) + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, + flow_table); rhashtable_destroy(&flow_table->rhashtable); - mutex_destroy(&flow_table->flow_block_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 5272721080f8..a3bca758b849 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -12,6 +12,7 @@ #include <net/ip6_route.h> #include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_conntrack_acct.h> /* For layer 4 checksum field offset. 
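Converting flow_block_lock from a mutex to an rw_semaphore, as above, lets the per-flow offload commands walk the callback list concurrently (down_read) while callback add/del stays exclusive (down_write). A userspace analogue using a pthread rwlock; the function names are invented parallels, not the kernel API:

/* Readers (per-flow offload commands) run concurrently; writers
 * (callback registration) get exclusive access. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cb_lock = PTHREAD_RWLOCK_INITIALIZER;
static int n_callbacks;

static void walk_callbacks(void)          /* like nf_flow_offload_tuple() */
{
    pthread_rwlock_rdlock(&cb_lock);      /* shared, like down_read() */
    printf("walking %d callbacks\n", n_callbacks);
    pthread_rwlock_unlock(&cb_lock);
}

static void add_callback(void)            /* like ..._offload_add_cb() */
{
    pthread_rwlock_wrlock(&cb_lock);      /* exclusive, like down_write() */
    n_callbacks++;
    pthread_rwlock_unlock(&cb_lock);
}

int main(void)
{
    add_callback();
    walk_callbacks();
    return 0;
}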
*/ #include <linux/tcp.h> #include <linux/udp.h> @@ -146,11 +147,13 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) return -1; + + iph = ip_hdr(skb); if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) return -1; return 0; @@ -189,6 +192,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + iph = ip_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v4.s_addr = iph->saddr; @@ -286,6 +290,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, ip_decrease_ttl(iph); skb->tstamp = 0; + if (flow_table->flags & NF_FLOWTABLE_COUNTER) + nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; @@ -417,11 +424,13 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow, if (test_bit(NF_FLOW_SNAT, &flow->flags) && (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) return -1; + + ip6h = ipv6_hdr(skb); if (test_bit(NF_FLOW_DNAT, &flow->flags) && (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) return -1; return 0; @@ -450,6 +459,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (!pskb_may_pull(skb, thoff + sizeof(*ports))) return -1; + ip6h = ipv6_hdr(skb); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v6 = ip6h->saddr; @@ -516,6 +526,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, ip6h->hop_limit--; skb->tstamp = 0; + if (flow_table->flags & NF_FLOWTABLE_COUNTER) + nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); + if (unlikely(dst_xfrm(&rt->dst))) { memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 42b73a084a63..e3b099c14eff 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -7,13 +7,13 @@ #include <linux/tc_act/tc_csum.h> #include <net/flow_offload.h> #include <net/netfilter/nf_flow_table.h> +#include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> -static struct work_struct nf_flow_offload_work; -static DEFINE_SPINLOCK(flow_offload_pending_list_lock); -static LIST_HEAD(flow_offload_pending_list); +static struct workqueue_struct *nf_flow_offload_wq; struct flow_offload_work { struct list_head list; @@ -21,17 +21,68 @@ struct flow_offload_work { int priority; struct nf_flowtable *flowtable; struct flow_offload *flow; + struct work_struct work; }; #define NF_FLOW_DISSECTOR(__match, __type, __field) \ (__match)->dissector.offset[__type] = 
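The nf_flow_nat_ip()/nf_flow_nat_ipv6() hunks below re-fetch ip_hdr(skb)/ipv6_hdr(skb) after each NAT helper because those helpers may pull or reallocate skb data, leaving any cached header pointer dangling. The same hazard in miniature with realloc(); purely illustrative:

/* A cached pointer into a buffer is invalid after any call that may
 * reallocate the buffer; it must be re-derived from the handle. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf { char *data; size_t len; };

static int grow(struct buf *b, size_t extra)  /* may move b->data */
{
    char *p = realloc(b->data, b->len + extra);

    if (!p)
        return -1;
    b->data = p;
    b->len += extra;
    return 0;
}

int main(void)
{
    struct buf b = { .data = strdup("hdr"), .len = 4 };
    char *hdr = b.data;          /* cached pointer, like iph */

    if (grow(&b, 4096) == 0) {   /* like pskb_may_pull() reallocating */
        hdr = b.data;            /* reload; the old pointer may dangle */
        printf("%s\n", hdr);
    }
    free(b.data);
    return 0;
}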
\ offsetof(struct nf_flow_key, __field) +static void nf_flow_rule_lwt_match(struct nf_flow_match *match, + struct ip_tunnel_info *tun_info) +{ + struct nf_flow_key *mask = &match->mask; + struct nf_flow_key *key = &match->key; + unsigned int enc_keys; + + if (!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)) + return; + + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); + key->enc_key_id.keyid = tunnel_id_to_key32(tun_info->key.tun_id); + mask->enc_key_id.keyid = 0xffffffff; + enc_keys = BIT(FLOW_DISSECTOR_KEY_ENC_KEYID) | + BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL); + + if (ip_tunnel_info_af(tun_info) == AF_INET) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + enc_ipv4); + key->enc_ipv4.src = tun_info->key.u.ipv4.dst; + key->enc_ipv4.dst = tun_info->key.u.ipv4.src; + if (key->enc_ipv4.src) + mask->enc_ipv4.src = 0xffffffff; + if (key->enc_ipv4.dst) + mask->enc_ipv4.dst = 0xffffffff; + enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS); + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else { + memcpy(&key->enc_ipv6.src, &tun_info->key.u.ipv6.dst, + sizeof(struct in6_addr)); + memcpy(&key->enc_ipv6.dst, &tun_info->key.u.ipv6.src, + sizeof(struct in6_addr)); + if (memcmp(&key->enc_ipv6.src, &in6addr_any, + sizeof(struct in6_addr))) + memset(&key->enc_ipv6.src, 0xff, + sizeof(struct in6_addr)); + if (memcmp(&key->enc_ipv6.dst, &in6addr_any, + sizeof(struct in6_addr))) + memset(&key->enc_ipv6.dst, 0xff, + sizeof(struct in6_addr)); + enc_keys |= BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS); + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + match->dissector.used_keys |= enc_keys; +} + static int nf_flow_rule_match(struct nf_flow_match *match, - const struct flow_offload_tuple *tuple) + const struct flow_offload_tuple *tuple, + struct dst_entry *other_dst) { struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; + struct ip_tunnel_info *tun_info; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); @@ -41,6 +92,11 @@ static int nf_flow_rule_match(struct nf_flow_match *match, NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp); + if (other_dst && other_dst->lwtstate) { + tun_info = lwt_tun_info(other_dst->lwtstate); + nf_flow_rule_lwt_match(match, tun_info); + } + key->meta.ingress_ifindex = tuple->iifidx; mask->meta.ingress_ifindex = 0xffffffff; @@ -64,6 +120,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, default: return -EOPNOTSUPP; } + mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT(key->control.addr_type); mask->basic.n_proto = 0xffff; @@ -419,10 +476,52 @@ static void flow_offload_redirect(const struct flow_offload *flow, dev_hold(rt->dst.dev); } +static void flow_offload_encap_tunnel(const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + struct dst_entry *dst; + + dst = flow->tuplehash[dir].tuple.dst_cache; + if (dst && dst->lwtstate) { + struct ip_tunnel_info *tun_info; + + tun_info = lwt_tun_info(dst->lwtstate); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_TUNNEL_ENCAP; + entry->tunnel = tun_info; + } + } +} + +static void flow_offload_decap_tunnel(const struct flow_offload *flow, + enum 
flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + struct flow_action_entry *entry; + struct dst_entry *dst; + + dst = flow->tuplehash[!dir].tuple.dst_cache; + if (dst && dst->lwtstate) { + struct ip_tunnel_info *tun_info; + + tun_info = lwt_tun_info(dst->lwtstate); + if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_TUNNEL_DECAP; + } + } +} + int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + flow_offload_decap_tunnel(flow, dir, flow_rule); + flow_offload_encap_tunnel(flow, dir, flow_rule); + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; @@ -449,6 +548,9 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + flow_offload_decap_tunnel(flow, dir, flow_rule); + flow_offload_encap_tunnel(flow, dir, flow_rule); + if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; @@ -479,6 +581,7 @@ nf_flow_offload_rule_alloc(struct net *net, const struct flow_offload *flow = offload->flow; const struct flow_offload_tuple *tuple; struct nf_flow_rule *flow_rule; + struct dst_entry *other_dst; int err = -ENOMEM; flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); @@ -494,7 +597,8 @@ nf_flow_offload_rule_alloc(struct net *net, flow_rule->rule->match.key = &flow_rule->match.key; tuple = &flow->tuplehash[dir].tuple; - err = nf_flow_rule_match(&flow_rule->match, tuple); + other_dst = flow->tuplehash[!dir].tuple.dst_cache; + err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst); if (err < 0) goto err_flow_match; @@ -574,6 +678,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, struct nf_flow_rule *flow_rule, enum flow_offload_tuple_dir dir, int priority, int cmd, + struct flow_stats *stats, struct list_head *block_cb_list) { struct flow_cls_offload cls_flow = {}; @@ -587,7 +692,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, if (cmd == FLOW_CLS_REPLACE) cls_flow.rule = flow_rule->rule; - mutex_lock(&flowtable->flow_block_lock); + down_read(&flowtable->flow_block_lock); list_for_each_entry(block_cb, block_cb_list, list) { err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); @@ -596,7 +701,10 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, i++; } - mutex_unlock(&flowtable->flow_block_lock); + up_read(&flowtable->flow_block_lock); + + if (cmd == FLOW_CLS_STATS) + memcpy(stats, &cls_flow.stats, sizeof(*stats)); return i; } @@ -607,7 +715,7 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload, { return nf_flow_offload_tuple(offload->flowtable, offload->flow, flow_rule, dir, offload->priority, - FLOW_CLS_REPLACE, + FLOW_CLS_REPLACE, NULL, &offload->flowtable->flow_block.cb_list); } @@ -615,7 +723,7 @@ static void flow_offload_tuple_del(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir) { nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, - offload->priority, FLOW_CLS_DESTROY, + offload->priority, FLOW_CLS_DESTROY, NULL, &offload->flowtable->flow_block.cb_list); } @@ -661,21 +769,9 @@ static void flow_offload_tuple_stats(struct flow_offload_work *offload, enum flow_offload_tuple_dir dir, struct flow_stats *stats) { - struct nf_flowtable *flowtable = 
offload->flowtable; - struct flow_cls_offload cls_flow = {}; - struct flow_block_cb *block_cb; - struct netlink_ext_ack extack; - __be16 proto = ETH_P_ALL; - - nf_flow_offload_init(&cls_flow, proto, offload->priority, - FLOW_CLS_STATS, - &offload->flow->tuplehash[dir].tuple, &extack); - - mutex_lock(&flowtable->flow_block_lock); - list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) - block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); - mutex_unlock(&flowtable->flow_block_lock); - memcpy(stats, &cls_flow.stats, sizeof(*stats)); + nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir, + offload->priority, FLOW_CLS_STATS, stats, + &offload->flowtable->flow_block.cb_list); } static void flow_offload_work_stats(struct flow_offload_work *offload) @@ -689,19 +785,25 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) lastused = max_t(u64, stats[0].lastused, stats[1].lastused); offload->flow->timeout = max_t(u64, offload->flow->timeout, lastused + NF_FLOW_TIMEOUT); + + if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) { + if (stats[0].pkts) + nf_ct_acct_add(offload->flow->ct, + FLOW_OFFLOAD_DIR_ORIGINAL, + stats[0].pkts, stats[0].bytes); + if (stats[1].pkts) + nf_ct_acct_add(offload->flow->ct, + FLOW_OFFLOAD_DIR_REPLY, + stats[1].pkts, stats[1].bytes); + } } static void flow_offload_work_handler(struct work_struct *work) { - struct flow_offload_work *offload, *next; - LIST_HEAD(offload_pending_list); - - spin_lock_bh(&flow_offload_pending_list_lock); - list_replace_init(&flow_offload_pending_list, &offload_pending_list); - spin_unlock_bh(&flow_offload_pending_list_lock); + struct flow_offload_work *offload; - list_for_each_entry_safe(offload, next, &offload_pending_list, list) { - switch (offload->cmd) { + offload = container_of(work, struct flow_offload_work, work); + switch (offload->cmd) { case FLOW_CLS_REPLACE: flow_offload_work_add(offload); break; @@ -713,19 +815,14 @@ static void flow_offload_work_handler(struct work_struct *work) break; default: WARN_ON_ONCE(1); - } - list_del(&offload->list); - kfree(offload); } + + kfree(offload); } static void flow_offload_queue_work(struct flow_offload_work *offload) { - spin_lock_bh(&flow_offload_pending_list_lock); - list_add_tail(&offload->list, &flow_offload_pending_list); - spin_unlock_bh(&flow_offload_pending_list_lock); - - schedule_work(&nf_flow_offload_work); + queue_work(nf_flow_offload_wq, &offload->work); } static struct flow_offload_work * @@ -742,6 +839,7 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, offload->flow = flow; offload->priority = flowtable->priority; offload->flowtable = flowtable; + INIT_WORK(&offload->work, flow_offload_work_handler); return offload; } @@ -792,7 +890,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { if (nf_flowtable_hw_offload(flowtable)) - flush_work(&nf_flow_offload_work); + flush_workqueue(nf_flow_offload_wq); } static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, @@ -820,25 +918,47 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, return err; } -static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, - struct nf_flowtable *flowtable, - struct net_device *dev, - enum flow_block_command cmd, - struct netlink_ext_ack *extack) +static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, + struct net *net, + enum flow_block_command cmd, + struct nf_flowtable *flowtable, + struct 
netlink_ext_ack *extack) { - int err; - - if (!dev->netdev_ops->ndo_setup_tc) - return -EOPNOTSUPP; - memset(bo, 0, sizeof(*bo)); - bo->net = dev_net(dev); + bo->net = net; bo->block = &flowtable->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; INIT_LIST_HEAD(&bo->cb_list); +} +static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) +{ + nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, + extack); + flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT); + + if (list_empty(&bo->cb_list)) + return -EOPNOTSUPP; + + return 0; +} + +static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, + struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd, + struct netlink_ext_ack *extack) +{ + int err; + + nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, + extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo); if (err < 0) return err; @@ -857,7 +977,12 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, if (!nf_flowtable_hw_offload(flowtable)) return 0; - err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); + if (dev->netdev_ops->ndo_setup_tc) + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, + &extack); + else + err = nf_flow_table_indr_offload_cmd(&bo, flowtable, dev, cmd, + &extack); if (err < 0) return err; @@ -865,22 +990,83 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); -int nf_flow_table_offload_init(void) +static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, + struct nf_flowtable *flowtable, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) { - INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - return 0; + if (!flowtable) + return; + + nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, + &extack); + + cb(dev, cb_priv, TC_SETUP_FT, &bo); + + nf_flow_table_block_setup(flowtable, &bo, cmd); } -void nf_flow_table_offload_exit(void) +static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, + struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) { - struct flow_offload_work *offload, *next; - LIST_HEAD(offload_pending_list); + if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) + return; - cancel_work_sync(&nf_flow_offload_work); + nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); +} - list_for_each_entry_safe(offload, next, &offload_pending_list, list) { - list_del(&offload->list); - kfree(offload); +static void nf_flow_table_indr_block_cb(struct net_device *dev, + flow_indr_block_bind_cb_t *cb, + void *cb_priv, + enum flow_block_command cmd) +{ + struct net *net = dev_net(dev); + struct nft_flowtable *nft_ft; + struct nft_table *table; + struct nft_hook *hook; + + mutex_lock(&net->nft.commit_mutex); + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(nft_ft, &table->flowtables, list) { + list_for_each_entry(hook, &nft_ft->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + nf_flow_table_indr_block_cb_cmd(&nft_ft->data, + dev, cb, + cb_priv, cmd); + } + } } + mutex_unlock(&net->nft.commit_mutex); +} + +static struct flow_indr_block_entry 
block_ing_entry = { + .cb = nf_flow_table_indr_block_cb, + .list = LIST_HEAD_INIT(block_ing_entry.list), +}; + +int nf_flow_table_offload_init(void) +{ + nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!nf_flow_offload_wq) + return -ENOMEM; + + flow_indr_add_block_cb(&block_ing_entry); + + return 0; +} + +void nf_flow_table_offload_exit(void) +{ + flow_indr_del_block_cb(&block_ing_entry); + destroy_workqueue(nf_flow_offload_wq); } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index f8f52ff99cfb..bbd1209694b8 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -46,25 +46,7 @@ void nf_unregister_queue_handler(struct net *net) } EXPORT_SYMBOL(nf_unregister_queue_handler); -static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - - if (nf_bridge) { - struct net_device *physdev; - - physdev = nf_bridge_get_physindev(skb); - if (physdev) - dev_put(physdev); - physdev = nf_bridge_get_physoutdev(skb); - if (physdev) - dev_put(physdev); - } -#endif -} - -void nf_queue_entry_release_refs(struct nf_queue_entry *entry) +static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; @@ -76,24 +58,34 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry) if (state->sk) sock_put(state->sk); - nf_queue_entry_release_br_nf_refs(entry->skb); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (entry->physin) + dev_put(entry->physin); + if (entry->physout) + dev_put(entry->physout); +#endif +} + +void nf_queue_entry_free(struct nf_queue_entry *entry) +{ + nf_queue_entry_release_refs(entry); + kfree(entry); } -EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs); +EXPORT_SYMBOL_GPL(nf_queue_entry_free); -static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb) +static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + const struct sk_buff *skb = entry->skb; + struct nf_bridge_info *nf_bridge; + nf_bridge = nf_bridge_info_get(skb); if (nf_bridge) { - struct net_device *physdev; - - physdev = nf_bridge_get_physindev(skb); - if (physdev) - dev_hold(physdev); - physdev = nf_bridge_get_physoutdev(skb); - if (physdev) - dev_hold(physdev); + entry->physin = nf_bridge_get_physindev(skb); + entry->physout = nf_bridge_get_physoutdev(skb); + } else { + entry->physin = NULL; + entry->physout = NULL; } #endif } @@ -110,7 +102,12 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) if (state->sk) sock_hold(state->sk); - nf_queue_entry_get_br_nf_refs(entry->skb); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + if (entry->physin) + dev_hold(entry->physin); + if (entry->physout) + dev_hold(entry->physout); +#endif } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -158,18 +155,16 @@ static void nf_ip6_saveroute(const struct sk_buff *skb, static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, unsigned int index, unsigned int queuenum) { - int status = -ENOENT; struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; struct net *net = state->net; unsigned int route_key_size; + int status; /* QUEUE == DROP if no one is waiting, to be safe. 
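The nf_queue rework in this hunk caches physin/physout once at entry setup and pairs every reference grab with a single release point, nf_queue_entry_free(), instead of spreading dev_put()/kfree() across callers. A toy sketch of that paired get/put discipline; all names are invented:

/* References taken at creation are dropped in exactly one place,
 * together with the free of the entry itself. */
#include <stdio.h>
#include <stdlib.h>

struct dev { int refcnt; };

struct entry {
    struct dev *in;              /* cached at creation, like physin */
};

static void dev_hold(struct dev *d) { if (d) d->refcnt++; }
static void dev_put(struct dev *d)  { if (d) d->refcnt--; }

static struct entry *entry_create(struct dev *in)
{
    struct entry *e = malloc(sizeof(*e));

    if (!e)
        return NULL;
    e->in = in;
    dev_hold(e->in);             /* like nf_queue_entry_get_refs() */
    return e;
}

static void entry_free(struct entry *e)
{
    dev_put(e->in);              /* single release point + free */
    free(e);
}

int main(void)
{
    struct dev d = { .refcnt = 0 };
    struct entry *e = entry_create(&d);

    if (e)
        entry_free(e);
    printf("refcnt back to %d\n", d.refcnt);
    return 0;
}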
*/ qh = rcu_dereference(net->nf.queue_handler); - if (!qh) { - status = -ESRCH; - goto err; - } + if (!qh) + return -ESRCH; switch (state->pf) { case AF_INET: @@ -184,14 +179,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, } entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); - if (!entry) { - status = -ENOMEM; - goto err; - } + if (!entry) + return -ENOMEM; if (skb_dst(skb) && !skb_dst_force(skb)) { - status = -ENETDOWN; - goto err; + kfree(entry); + return -ENETDOWN; } *entry = (struct nf_queue_entry) { @@ -201,6 +194,8 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, .size = sizeof(*entry) + route_key_size, }; + __nf_queue_entry_init_physdevs(entry); + nf_queue_entry_get_refs(entry); switch (entry->state.pf) { @@ -213,17 +208,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, } status = qh->outfn(entry, queuenum); - if (status < 0) { - nf_queue_entry_release_refs(entry); - goto err; + nf_queue_entry_free(entry); + return status; } return 0; - -err: - kfree(entry); - return status; } /* Packets leaving via this function must come back through nf_reinject(). */ @@ -304,12 +294,10 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) hooks = nf_hook_entries_head(net, pf, entry->state.hook); - nf_queue_entry_release_refs(entry); - i = entry->hook_index; if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { kfree_skb(skb); - kfree(entry); + nf_queue_entry_free(entry); return; } @@ -348,6 +336,6 @@ next_hook: kfree_skb(skb); } - kfree(entry); + nf_queue_entry_free(entry); } EXPORT_SYMBOL(nf_reinject); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 38c680f28f15..d0ab5ffa1e2c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -520,7 +520,8 @@ static struct nft_table *nft_table_lookup(const struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &net->nft.tables, list, + lockdep_is_held(&net->nft.commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) @@ -2523,8 +2524,8 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx, module_put(type->owner); } -struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, - const struct nlattr *nla) +static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, + const struct nlattr *nla) { struct nft_expr_info info; struct nft_expr *expr; @@ -2557,6 +2558,24 @@ err1: return ERR_PTR(err); } +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +{ + int err; + + if (src->ops->clone) { + dst->ops = src->ops; + err = src->ops->clone(dst, src); + if (err < 0) + return err; + } else { + memcpy(dst, src, src->ops->size); + } + + __module_get(src->ops->type->owner); + + return 0; +} + void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) { nf_tables_expr_destroy(ctx, expr); @@ -3266,25 +3285,17 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, /* * Sets */ - -static LIST_HEAD(nf_tables_set_types); - -int nft_register_set(struct nft_set_type *type) -{ - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_add_tail_rcu(&type->list, &nf_tables_set_types); - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - return 0; -} -EXPORT_SYMBOL_GPL(nft_register_set); - -void nft_unregister_set(struct nft_set_type *type) -{ - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_del_rcu(&type->list); - 
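nft_expr_clone(), hoisted out of nft_dynset into the core above, copies an expression through its type's optional clone hook and falls back to a flat memcpy for stateless types. A standalone sketch of that clone-or-copy dispatch under invented types:

/* Polymorphic copy: deep-clone when the type provides a hook,
 * otherwise a flat memcpy of the instance is sufficient. */
#include <stdio.h>
#include <string.h>

struct expr;

struct expr_ops {
    size_t size;
    int (*clone)(struct expr *dst, const struct expr *src); /* optional */
};

struct expr {
    const struct expr_ops *ops;
    int counter;
};

static int expr_clone(struct expr *dst, const struct expr *src)
{
    if (src->ops->clone) {
        dst->ops = src->ops;
        return src->ops->clone(dst, src);
    }
    memcpy(dst, src, src->ops->size);
    return 0;
}

static int counter_clone(struct expr *dst, const struct expr *src)
{
    (void)src;
    dst->counter = 0;            /* stateful: each clone starts fresh */
    return 0;
}

static const struct expr_ops counter_ops = {
    .size = sizeof(struct expr),
    .clone = counter_clone,
};

int main(void)
{
    struct expr a = { .ops = &counter_ops, .counter = 7 }, b;

    expr_clone(&b, &a);
    printf("cloned counter=%d\n", b.counter);  /* 0, not 7 */
    return 0;
}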
nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_unregister_set); +static const struct nft_set_type *nft_set_types[] = { + &nft_set_hash_fast_type, + &nft_set_hash_type, + &nft_set_rhash_type, + &nft_set_bitmap_type, + &nft_set_rbtree_type, +#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2) + &nft_set_pipapo_avx2_type, +#endif + &nft_set_pipapo_type, +}; #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ @@ -3310,15 +3321,11 @@ nft_select_set_ops(const struct nft_ctx *ctx, struct nft_set_estimate est, best; const struct nft_set_type *type; u32 flags = 0; + int i; lockdep_assert_held(&ctx->net->nft.commit_mutex); lockdep_nfnl_nft_mutex_not_held(); -#ifdef CONFIG_MODULES - if (list_empty(&nf_tables_set_types)) { - if (nft_request_module(ctx->net, "nft-set") == -EAGAIN) - return ERR_PTR(-EAGAIN); - } -#endif + if (nla[NFTA_SET_FLAGS] != NULL) flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); @@ -3327,7 +3334,8 @@ nft_select_set_ops(const struct nft_ctx *ctx, best.lookup = ~0; best.space = ~0; - list_for_each_entry(type, &nf_tables_set_types, list) { + for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) { + type = nft_set_types[i]; ops = &type->ops; if (!nft_set_ops_candidate(type, flags)) @@ -3358,11 +3366,6 @@ nft_select_set_ops(const struct nft_ctx *ctx, break; } - if (!try_module_get(type->owner)) - continue; - if (bops != NULL) - module_put(to_set_type(bops)->owner); - bops = ops; best = est; } @@ -3392,6 +3395,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, [NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_SET_HANDLE] = { .type = NLA_U64 }, + [NFTA_SET_EXPR] = { .type = NLA_NESTED }, }; static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { @@ -3595,8 +3599,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, { struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - struct nlattr *desc; u32 portid = ctx->portid; + struct nlattr *nest; u32 seq = ctx->seq; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); @@ -3652,9 +3656,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata)) goto nla_put_failure; - desc = nla_nest_start_noflag(skb, NFTA_SET_DESC); - - if (desc == NULL) + nest = nla_nest_start_noflag(skb, NFTA_SET_DESC); + if (!nest) goto nla_put_failure; if (set->size && nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) @@ -3664,7 +3667,15 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nf_tables_fill_set_concat(skb, set)) goto nla_put_failure; - nla_nest_end(skb, desc); + nla_nest_end(skb, nest); + + if (set->expr) { + nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); + if (nf_tables_fill_expr_info(skb, set->expr) < 0) + goto nla_put_failure; + + nla_nest_end(skb, nest); + } nlmsg_end(skb, nlh); return 0; @@ -3911,6 +3922,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; const struct nft_set_ops *ops; + struct nft_expr *expr = NULL; struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -4020,6 +4032,9 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return err; } + if (nla[NFTA_SET_EXPR]) + desc.expr = true; + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); if (IS_ERR(table)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); @@ -4061,21 +4076,27 @@ static int 
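Replacing the nft_register_set()/nft_unregister_set() list with the const nft_set_types[] array means backend selection becomes a walk over a fixed table comparing estimates, and the try_module_get()/module_put() dance disappears because every backend is now built in. A simplified sketch of table-driven selection, with made-up flags and cost classes:

/* Pick the cheapest backend from a const table that can represent the
 * requested feature flags; no runtime registration, no refcounting. */
#include <stdio.h>

struct set_type {
    const char *name;
    unsigned int features;       /* flags this backend supports */
    unsigned int lookup_class;   /* smaller is faster */
};

static const struct set_type set_types[] = {
    { "hash_fast", 0x1, 2 },
    { "bitmap",    0x0, 1 },
    { "rbtree",    0x3, 3 },
};

static const struct set_type *select_type(unsigned int flags)
{
    const struct set_type *best = NULL;
    size_t i;

    for (i = 0; i < sizeof(set_types) / sizeof(set_types[0]); i++) {
        const struct set_type *t = &set_types[i];

        if (flags & ~t->features)
            continue;            /* can't represent requested flags */
        if (!best || t->lookup_class < best->lookup_class)
            best = t;
    }
    return best;
}

int main(void)
{
    const struct set_type *t = select_type(0x1);

    printf("selected: %s\n", t ? t->name : "none");
    return 0;
}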
nf_tables_newset(struct net *net, struct sock *nlsk, size = ops->privsize(nla, &desc); set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); - if (!set) { - err = -ENOMEM; - goto err1; - } + if (!set) + return -ENOMEM; name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); if (!name) { err = -ENOMEM; - goto err2; + goto err_set_name; } err = nf_tables_set_alloc_name(&ctx, set, name); kfree(name); if (err < 0) - goto err2; + goto err_set_alloc_name; + + if (nla[NFTA_SET_EXPR]) { + expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]); + if (IS_ERR(expr)) { + err = PTR_ERR(expr); + goto err_set_alloc_name; + } + } udata = NULL; if (udlen) { @@ -4092,6 +4113,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, set->dtype = dtype; set->objtype = objtype; set->dlen = desc.dlen; + set->expr = expr; set->flags = flags; set->size = desc.size; set->policy = policy; @@ -4107,34 +4129,37 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = ops->init(set, &desc, nla); if (err < 0) - goto err3; + goto err_set_init; err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err4; + goto err_set_trans; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; -err4: +err_set_trans: ops->destroy(set); -err3: +err_set_init: + if (expr) + nft_expr_destroy(&ctx, expr); +err_set_alloc_name: kfree(set->name); -err2: +err_set_name: kvfree(set); -err1: - module_put(to_set_type(ops)->owner); return err; } -static void nft_set_destroy(struct nft_set *set) +static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { if (WARN_ON(set->use > 0)) return; + if (set->expr) + nft_expr_destroy(ctx, set->expr); + set->ops->destroy(set); - module_put(to_set_type(set->ops)->owner); kfree(set->name); kvfree(set); } @@ -4274,7 +4299,7 @@ EXPORT_SYMBOL_GPL(nf_tables_deactivate_set); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) { if (list_empty(&set->bindings) && nft_set_is_anonymous(set)) - nft_set_destroy(set); + nft_set_destroy(ctx, set); } EXPORT_SYMBOL_GPL(nf_tables_destroy_set); @@ -4312,7 +4337,6 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u32), }, }; -EXPORT_SYMBOL_GPL(nft_set_ext_types); /* * Set elements @@ -4801,6 +4825,36 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, return trans; } +struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nlattr *attr) +{ + struct nft_expr *expr; + int err; + + expr = nft_expr_init(ctx, attr); + if (IS_ERR(expr)) + return expr; + + err = -EOPNOTSUPP; + if (!(expr->ops->type->flags & NFT_EXPR_STATEFUL)) + goto err_set_elem_expr; + + if (expr->ops->type->flags & NFT_EXPR_GC) { + if (set->flags & NFT_SET_TIMEOUT) + goto err_set_elem_expr; + if (!set->ops->gc_init) + goto err_set_elem_expr; + set->ops->gc_init(set); + } + + return expr; + +err_set_elem_expr: + nft_expr_destroy(ctx, expr); + return ERR_PTR(err); +} + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, @@ -4832,6 +4886,17 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } +static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_expr *expr) +{ + if (expr->ops->destroy_clone) { + expr->ops->destroy_clone(ctx, expr); + module_put(expr->ops->type->owner); + } else { + nf_tables_expr_destroy(ctx, expr); + } +} + void nft_set_elem_destroy(const struct nft_set *set, void *elem, bool destroy_expr) 
{ @@ -4844,16 +4909,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); - if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) { - struct nft_expr *expr = nft_set_ext_expr(ext); + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); - if (expr->ops->destroy_clone) { - expr->ops->destroy_clone(&ctx, expr); - module_put(expr->ops->type->owner); - } else { - nf_tables_expr_destroy(&ctx, expr); - } - } if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) (*nft_set_ext_obj(ext))->use--; kfree(elem); @@ -4869,7 +4927,8 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, struct nft_set_ext *ext = nft_set_elem_ext(set, elem); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext)); + nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); + kfree(elem); } @@ -4883,6 +4942,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_object *obj = NULL; + struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; struct nft_data data; @@ -4950,10 +5010,29 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } + if (nla[NFTA_SET_ELEM_EXPR] != NULL) { + expr = nft_set_elem_expr_alloc(ctx, set, + nla[NFTA_SET_ELEM_EXPR]); + if (IS_ERR(expr)) + return PTR_ERR(expr); + + err = -EOPNOTSUPP; + if (set->expr && set->expr->ops != expr->ops) + goto err_set_elem_expr; + } else if (set->expr) { + expr = kzalloc(set->expr->ops->size, GFP_KERNEL); + if (!expr) + return -ENOMEM; + + err = nft_expr_clone(expr, set->expr); + if (err < 0) + goto err_set_elem_expr; + } + err = nft_setelem_parse_key(ctx, set, &elem.key.val, nla[NFTA_SET_ELEM_KEY]); if (err < 0) - return err; + goto err_set_elem_expr; nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); @@ -4972,6 +5051,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); } + if (expr) + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPR, + expr->ops->size); + if (nla[NFTA_SET_ELEM_OBJREF] != NULL) { if (!(set->flags & NFT_SET_OBJECT)) { err = -EINVAL; @@ -5056,6 +5139,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, *nft_set_ext_obj(ext) = obj; obj->use++; } + if (expr) { + memcpy(nft_set_ext_expr(ext), expr, expr->ops->size); + kfree(expr); + expr = NULL; + } trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set); if (trans == NULL) @@ -5082,6 +5170,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EBUSY; else if (!(nlmsg_flags & NLM_F_EXCL)) err = 0; + } else if (err == -ENOTEMPTY) { + /* ENOTEMPTY reports overlapping between this element + * and an existing one. 
+ */ + err = -EEXIST; } goto err_element_clash; } @@ -5103,7 +5196,8 @@ err_element_clash: err_trans: if (obj) obj->use--; - kfree(elem.priv); + + nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) nft_data_release(&data, desc.type); @@ -5111,6 +5205,9 @@ err_parse_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: nft_data_release(&elem.key.val, NFT_DATA_VALUE); +err_set_elem_expr: + if (expr != NULL) + nft_expr_destroy(ctx, expr); return err; } @@ -5365,7 +5462,6 @@ void nft_set_gc_batch_release(struct rcu_head *rcu) nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); kfree(gcb); } -EXPORT_SYMBOL_GPL(nft_set_gc_batch_release); struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, gfp_t gfp) @@ -5378,7 +5474,6 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, gcb->head.set = set; return gcb; } -EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc); /* * Stateful objects @@ -6289,7 +6384,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (nla[NFTA_FLOWTABLE_FLAGS]) { flowtable->data.flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS])); - if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD) + if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) goto err3; } @@ -6977,7 +7072,7 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_DELSET: - nft_set_destroy(nft_trans_set(trans)); + nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_DELSETELEM: nf_tables_set_elem_destroy(&trans->ctx, @@ -7408,7 +7503,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); break; case NFT_MSG_NEWSET: - nft_set_destroy(nft_trans_set(trans)); + nft_set_destroy(&trans->ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), @@ -8134,7 +8229,7 @@ static void __nft_release_tables(struct net *net) list_for_each_entry_safe(set, ns, &table->sets, list) { list_del(&set->list); table->use--; - nft_set_destroy(set); + nft_set_destroy(&ctx, set); } list_for_each_entry_safe(obj, ne, &table->objects, list) { nft_obj_del(obj); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 2bb28483af22..954bccb7f32a 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -313,7 +313,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - flow_indr_block_call(dev, &bo, cmd); + flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK); if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c deleted file mode 100644 index 586b621007eb..000000000000 --- a/net/netfilter/nf_tables_set_core.c +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <linux/module.h> -#include <net/netfilter/nf_tables_core.h> - -static int __init nf_tables_set_module_init(void) -{ - nft_register_set(&nft_set_hash_fast_type); - nft_register_set(&nft_set_hash_type); - nft_register_set(&nft_set_rhash_type); - nft_register_set(&nft_set_bitmap_type); - nft_register_set(&nft_set_rbtree_type); - nft_register_set(&nft_set_pipapo_type); - - return 0; -} - -static void __exit nf_tables_set_module_exit(void) -{ - nft_unregister_set(&nft_set_pipapo_type); - 
nft_unregister_set(&nft_set_rbtree_type); - nft_unregister_set(&nft_set_bitmap_type); - nft_unregister_set(&nft_set_rhash_type); - nft_unregister_set(&nft_set_hash_type); - nft_unregister_set(&nft_set_hash_fast_type); -} - -module_init(nf_tables_set_module_init); -module_exit(nf_tables_set_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 2481470dec36..5827117f2635 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -33,7 +33,7 @@ struct nf_acct { refcount_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; - char data[0]; + char data[]; }; struct nfacct_filter { diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 76535fd9278c..3243a31f6e82 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -737,12 +737,6 @@ static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) #define nf_bridge_adjust_segmented_data(s) do {} while (0) #endif -static void free_entry(struct nf_queue_entry *entry) -{ - nf_queue_entry_release_refs(entry); - kfree(entry); -} - static int __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, struct sk_buff *skb, struct nf_queue_entry *entry) @@ -768,7 +762,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, entry_seg->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry_seg); if (ret) - free_entry(entry_seg); + nf_queue_entry_free(entry_seg); } return ret; } @@ -827,7 +821,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) if (queued) { if (err) /* some segments are already queued */ - free_entry(entry); + nf_queue_entry_free(entry); kfree_skb(skb); return 0; } diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 0ed2281f03be..bc37d6c59db4 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -93,7 +93,7 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { static int nft_bitwise_init_bool(struct nft_bitwise *priv, const struct nlattr *const tb[]) { - struct nft_data_desc d1, d2; + struct nft_data_desc mask, xor; int err; if (tb[NFTA_BITWISE_DATA]) @@ -103,29 +103,29 @@ static int nft_bitwise_init_bool(struct nft_bitwise *priv, !tb[NFTA_BITWISE_XOR]) return -EINVAL; - err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1, + err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &mask, tb[NFTA_BITWISE_MASK]); if (err < 0) return err; - if (d1.type != NFT_DATA_VALUE || d1.len != priv->len) { + if (mask.type != NFT_DATA_VALUE || mask.len != priv->len) { err = -EINVAL; goto err1; } - err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &d2, + err = nft_data_init(NULL, &priv->xor, sizeof(priv->xor), &xor, tb[NFTA_BITWISE_XOR]); if (err < 0) goto err1; - if (d2.type != NFT_DATA_VALUE || d2.len != priv->len) { + if (xor.type != NFT_DATA_VALUE || xor.len != priv->len) { err = -EINVAL; goto err2; } return 0; err2: - nft_data_release(&priv->xor, d2.type); + nft_data_release(&priv->xor, xor.type); err1: - nft_data_release(&priv->mask, d1.type); + nft_data_release(&priv->mask, mask.type); return err; } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 683785225a3e..64ca13a1885b 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -24,23 +24,6 @@ struct nft_dynset { struct nft_set_binding binding; }; -static int nft_expr_clone(struct nft_expr *dst, struct 
nft_expr *src) -{ - int err; - - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } - - __module_get(src->ops->type->owner); - return 0; -} - static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, struct nft_regs *regs) { @@ -81,7 +64,6 @@ void nft_dynset_eval(const struct nft_expr *expr, const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set *set = priv->set; const struct nft_set_ext *ext; - const struct nft_expr *sexpr; u64 timeout; if (priv->op == NFT_DYNSET_OP_DELETE) { @@ -91,18 +73,13 @@ void nft_dynset_eval(const struct nft_expr *expr, if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { - sexpr = NULL; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) - sexpr = nft_set_ext_expr(ext); - if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; } - if (sexpr != NULL) - sexpr->ops->eval(sexpr, regs, pkt); + nft_set_elem_update_expr(ext, regs, pkt); if (priv->invert) regs->verdict.code = NFT_BREAK; @@ -206,20 +183,14 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (!(set->flags & NFT_SET_EVAL)) return -EINVAL; - priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]); + priv->expr = nft_set_elem_expr_alloc(ctx, set, + tb[NFTA_DYNSET_EXPR]); if (IS_ERR(priv->expr)) return PTR_ERR(priv->expr); - err = -EOPNOTSUPP; - if (!(priv->expr->ops->type->flags & NFT_EXPR_STATEFUL)) - goto err1; - - if (priv->expr->ops->type->flags & NFT_EXPR_GC) { - if (set->flags & NFT_SET_TIMEOUT) - goto err1; - if (!set->ops->gc_init) - goto err1; - set->ops->gc_init(set); + if (set->expr && set->expr->ops != priv->expr->ops) { + err = -EOPNOTSUPP; + goto err_expr_free; } } @@ -239,7 +210,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, err = nf_tables_bind_set(ctx, set, &priv->binding); if (err < 0) - goto err1; + goto err_expr_free; if (set->size == 0) set->size = 0xffff; @@ -247,7 +218,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->set = set; return 0; -err1: +err_expr_free: if (priv->expr != NULL) nft_expr_destroy(ctx, priv->expr); return err; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a5e8469859e3..07782836fad6 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -228,7 +228,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, unsigned int i, optl, tcphdr_len, offset; struct tcphdr *tcph; u8 *opt; - u32 src; tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) @@ -237,7 +236,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { - u8 octet; __be16 v16; __be32 v32; } old, new; @@ -259,13 +257,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (!tcph) return; - src = regs->data[priv->sreg]; offset = i + priv->offset; switch (priv->len) { case 2: old.v16 = get_unaligned((u16 *)(opt + offset)); - new.v16 = src; + new.v16 = (__force __be16)nft_reg_load16( + ®s->data[priv->sreg]); switch (priv->type) { case TCPOPT_MSS: @@ -283,7 +281,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, old.v16, new.v16, false); break; case 4: - new.v32 = src; + new.v32 = regs->data[priv->sreg]; old.v32 = get_unaligned((u32 *)(opt + offset)); if 
(old.v32 == new.v32) diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index aba11c2333f3..3087e23297db 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -28,6 +28,9 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; + /* This is used by ifb only. */ + skb_set_redirected(pkt->skb, true); + nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; } @@ -190,6 +193,13 @@ nla_put_failure: return -1; } +static int nft_fwd_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS)); +} + static struct nft_expr_type nft_fwd_netdev_type; static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .type = &nft_fwd_netdev_type, @@ -197,6 +207,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .eval = nft_fwd_neigh_eval, .init = nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, + .validate = nft_fwd_validate, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { @@ -205,6 +216,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, + .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, }; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 660bad688e2b..1e70359d633c 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -43,6 +43,7 @@ void nft_lookup_eval(const struct nft_expr *expr, nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); + nft_set_elem_update_expr(ext, regs, pkt); } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 87e8d9ba0c9b..32f0fc8be3a4 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -81,6 +81,7 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, u32 idx, off; nft_bitmap_location(set, key, &idx, &off); + *ext = NULL; return nft_bitmap_active(priv->bitmap, idx, off, genmask); } @@ -285,6 +286,8 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, /* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. 
*/ if (desc->klen > 2) return false; + else if (desc->expr) + return false; est->size = nft_bitmap_total_size(desc->klen); est->lookup = NFT_SET_CLASS_O_1; @@ -293,8 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -struct nft_set_type nft_set_bitmap_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_bitmap_type = { .ops = { .privsize = nft_bitmap_privsize, .elemsize = offsetof(struct nft_bitmap_elem, ext), diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index d350a7cd3af0..4d3f147e8d8d 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -662,8 +662,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return true; } -struct nft_set_type nft_set_rhash_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_rhash_type = { .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT | NFT_SET_EVAL, .ops = { @@ -686,8 +685,7 @@ struct nft_set_type nft_set_rhash_type __read_mostly = { }, }; -struct nft_set_type nft_set_hash_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_hash_type = { .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { .privsize = nft_hash_privsize, @@ -706,8 +704,7 @@ struct nft_set_type nft_set_hash_type __read_mostly = { }, }; -struct nft_set_type nft_set_hash_fast_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_hash_fast_type = { .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { .privsize = nft_hash_privsize, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4fc0c924ed5d..87aabf651cfe 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -330,144 +330,22 @@ #include <linux/kernel.h> #include <linux/init.h> -#include <linux/log2.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <uapi/linux/netfilter/nf_tables.h> -#include <net/ipv6.h> /* For the maximum length of a field */ #include <linux/bitmap.h> #include <linux/bitops.h> -/* Count of concatenated fields depends on count of 32-bit nftables registers */ -#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT - -/* Largest supported field size */ -#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) -#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) - -/* Number of bits to be grouped together in lookup table buckets, arbitrary */ -#define NFT_PIPAPO_GROUP_BITS 4 -#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) - -/* Fields are padded to 32 bits in input registers */ -#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ - (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) -#define NFT_PIPAPO_GROUPS_PADDING(x) \ - (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) - -/* Number of buckets, given by 2 ^ n, with n grouped bits */ -#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) - -/* Each n-bit range maps to up to n * 2 rules */ -#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) - -/* Use the rest of mapping table buckets for rule indices, but it makes no sense - * to exceed 32 bits - */ -#if BITS_PER_LONG == 64 -#define NFT_PIPAPO_MAP_TOBITS 32 -#else -#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) -#endif - -/* ...which gives us the highest allowed index for a rule */ -#define 
NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ - - (1UL << NFT_PIPAPO_MAP_NBITS)) - -#define nft_pipapo_for_each_field(field, index, match) \ - for ((field) = (match)->f, (index) = 0; \ - (index) < (match)->field_count; \ - (index)++, (field)++) - -/** - * union nft_pipapo_map_bucket - Bucket of mapping table - * @to: First rule number (in next field) this rule maps to - * @n: Number of rules (in next field) this rule maps to - * @e: If there's no next field, pointer to element this rule maps to - */ -union nft_pipapo_map_bucket { - struct { -#if BITS_PER_LONG == 64 - static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); - u32 to; - - static_assert(NFT_PIPAPO_MAP_NBITS <= 32); - u32 n; -#else - unsigned long to:NFT_PIPAPO_MAP_TOBITS; - unsigned long n:NFT_PIPAPO_MAP_NBITS; -#endif - }; - struct nft_pipapo_elem *e; -}; - -/** - * struct nft_pipapo_field - Lookup, mapping tables and related data for a field - * @groups: Amount of 4-bit groups - * @rules: Number of inserted rules - * @bsize: Size of each bucket in lookup table, in longs - * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets - * @mt: Mapping table: one bucket per rule - */ -struct nft_pipapo_field { - int groups; - unsigned long rules; - size_t bsize; - unsigned long *lt; - union nft_pipapo_map_bucket *mt; -}; - -/** - * struct nft_pipapo_match - Data used for lookup and matching - * @field_count Amount of fields in set - * @scratch: Preallocated per-CPU maps for partial matching results - * @bsize_max: Maximum lookup table bucket size of all fields, in longs - * @rcu Matching data is swapped on commits - * @f: Fields, with lookup and mapping tables - */ -struct nft_pipapo_match { - int field_count; - unsigned long * __percpu *scratch; - size_t bsize_max; - struct rcu_head rcu; - struct nft_pipapo_field f[0]; -}; +#include "nft_set_pipapo_avx2.h" +#include "nft_set_pipapo.h" /* Current working bitmap index, toggled between field matches */ static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); /** - * struct nft_pipapo - Representation of a set - * @match: Currently in-use matching data - * @clone: Copy where pending insertions and deletions are kept - * @groups: Total amount of 4-bit groups for fields in this set - * @width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions - * @last_gc: Timestamp of last garbage collection run, jiffies - */ -struct nft_pipapo { - struct nft_pipapo_match __rcu *match; - struct nft_pipapo_match *clone; - int groups; - int width; - bool dirty; - unsigned long last_gc; -}; - -struct nft_pipapo_elem; - -/** - * struct nft_pipapo_elem - API-facing representation of single set element - * @ext: nftables API extensions - */ -struct nft_pipapo_elem { - struct nft_set_ext ext; -}; - -/** * pipapo_refill() - For each set bit, set bits from selected mapping table item * @map: Bitmap to be scanned for set bits * @len: Length of bitmap in longs @@ -484,9 +362,8 @@ struct nft_pipapo_elem { * * Return: -1 on no match, bit position on 'match_only', 0 otherwise. 
*/ -static int pipapo_refill(unsigned long *map, int len, int rules, - unsigned long *dst, union nft_pipapo_map_bucket *mt, - bool match_only) +int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, + union nft_pipapo_map_bucket *mt, bool match_only) { unsigned long bitset; int k, ret = -1; @@ -559,26 +436,18 @@ static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; - unsigned long *lt = f->lt; - int b, group; + int b; - /* For each 4-bit group: select lookup table bucket depending on + /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ - for (group = 0; group < f->groups; group += 2) { - u8 v; - - v = *rp >> 4; - __bitmap_and(res_map, res_map, lt + v * f->bsize, - f->bsize * BITS_PER_LONG); - lt += f->bsize * NFT_PIPAPO_BUCKETS; - - v = *rp & 0x0f; - rp++; - __bitmap_and(res_map, res_map, lt + v * f->bsize, - f->bsize * BITS_PER_LONG); - lt += f->bsize * NFT_PIPAPO_BUCKETS; - } + if (likely(f->bb == 8)) + pipapo_and_field_buckets_8bit(f, res_map, rp); + else + pipapo_and_field_buckets_4bit(f, res_map, rp); + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + + rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -621,7 +490,7 @@ next_match: map_index = !map_index; swap(res_map, fill_map); - rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + rp += NFT_PIPAPO_GROUPS_PADDING(f); } out: @@ -669,26 +538,19 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1; - unsigned long *lt = f->lt; - int b, group; + int b; - /* For each 4-bit group: select lookup table bucket depending on + /* For each bit group: select lookup table bucket depending on * packet bytes value, then AND bucket value */ - for (group = 0; group < f->groups; group++) { - u8 v; - - if (group % 2) { - v = *data & 0x0f; - data++; - } else { - v = *data >> 4; - } - __bitmap_and(res_map, res_map, lt + v * f->bsize, - f->bsize * BITS_PER_LONG); + if (f->bb == 8) + pipapo_and_field_buckets_8bit(f, res_map, data); + else if (f->bb == 4) + pipapo_and_field_buckets_4bit(f, res_map, data); + else + BUG(); - lt += f->bsize * NFT_PIPAPO_BUCKETS; - } + data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); /* Now populate the bitmap for the next field, unless this is * the last field, in which case return the matched 'ext' @@ -713,7 +575,7 @@ next_match: goto out; } - data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + data += NFT_PIPAPO_GROUPS_PADDING(f); /* Swap bitmap indices: fill_map will be the initial bitmap for * the next field (i.e. 
the new res_map), and res_map is @@ -736,8 +598,8 @@ out: * @elem: nftables API element representation containing key data * @flags: Unused */ -void *nft_pipapo_get(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem, unsigned int flags) +static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) { return pipapo_get(net, set, (const u8 *)elem->key.val.data, nft_genmask_cur(net)); @@ -763,6 +625,10 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) int group, bucket; new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); +#ifdef NFT_PIPAPO_ALIGN + new_bucket_size = roundup(new_bucket_size, + NFT_PIPAPO_ALIGN / sizeof(*new_lt)); +#endif if (new_bucket_size == f->bsize) goto mt; @@ -772,15 +638,18 @@ static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * - sizeof(*new_lt), GFP_KERNEL); + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * + new_bucket_size * sizeof(*new_lt) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL); if (!new_lt) return -ENOMEM; - new_p = new_lt; - old_p = old_lt; + new_p = NFT_PIPAPO_LT_ALIGN(new_lt); + old_p = NFT_PIPAPO_LT_ALIGN(old_lt); + for (group = 0; group < f->groups; group++) { - for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { memcpy(new_p, old_p, copy * sizeof(*new_p)); new_p += copy; old_p += copy; @@ -807,7 +676,7 @@ mt: if (new_lt) { f->bsize = new_bucket_size; - f->lt = new_lt; + NFT_PIPAPO_LT_ASSIGN(f, new_lt); kvfree(old_lt); } @@ -829,13 +698,196 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, { unsigned long *pos; - pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos = NFT_PIPAPO_LT_ALIGN(f->lt); + pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; pos += f->bsize * v; __set_bit(rule, pos); } /** + * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits + * @old_groups: Number of current groups + * @bsize: Size of one bucket, in longs + * @old_lt: Pointer to the current lookup table + * @new_lt: Pointer to the new, pre-allocated lookup table + * + * Each bucket with index b in the new lookup table, belonging to group g, is + * filled with the bit intersection between: + * - bucket with index given by the upper 4 bits of b, from group g, and + * - bucket with index given by the lower 4 bits of b, from group g + 1 + * + * That is, given buckets from the new lookup table N(x, y) and the old lookup + * table O(x, y), with x bucket index, and y group index: + * + * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) + * + * This ensures equivalence of the matching results on lookup. Two examples in + * pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 + * 0 ^ + * 1 | ^ + * ... ( & ) | + * / \ | + * / \ .-( & )-. + * / bucket \ | | + * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | + * 0 / \ | | + * 1 \ | | + * 2 | --' + * 3 '- + * ... 
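+ * For instance, new bucket 0x37 of group g is
+ * N(0x37, g) = O(0x3, g) & O(0x7, g + 1): a packet byte of 0x37 matches a
+ * rule in the 8-bit table only if its upper nibble 0x3 and its lower
+ * nibble 0x7 both matched in the corresponding pair of 4-bit groups.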
+ */ +static void pipapo_lt_4b_to_8b(int old_groups, int bsize, + unsigned long *old_lt, unsigned long *new_lt) +{ + int g, b, i; + + for (g = 0; g < old_groups / 2; g++) { + int src_g0 = g * 2, src_g1 = g * 2 + 1; + + for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { + int src_b0 = b / NFT_PIPAPO_BUCKETS(4); + int src_b1 = b % NFT_PIPAPO_BUCKETS(4); + int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; + int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; + + for (i = 0; i < bsize; i++) { + *new_lt = old_lt[src_i0 * bsize + i] & + old_lt[src_i1 * bsize + i]; + new_lt++; + } + } + } +} + +/** + * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits + * @old_groups: Number of current groups + * @bsize: Size of one bucket, in longs + * @old_lt: Pointer to the current lookup table + * @new_lt: Pointer to the new, pre-allocated lookup table + * + * Each bucket with index b in the new lookup table, belonging to group g, is + * filled with the bit union of: + * - all the buckets with index such that the upper four bits of the lower byte + * equal b, from group g, with g odd + * - all the buckets with index such that the lower four bits equal b, from + * group g, with g even + * + * That is, given buckets from the new lookup table N(x, y) and the old lookup + * table O(x, y), with x bucket index, and y group index: + * + * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) + * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) + * + * where U() denotes the arbitrary union operation (binary OR of n terms). This + * ensures equivalence of the matching results on lookup. + */ +static void pipapo_lt_8b_to_4b(int old_groups, int bsize, + unsigned long *old_lt, unsigned long *new_lt) +{ + int g, b, bsrc, i; + + memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * + sizeof(unsigned long)); + + for (g = 0; g < old_groups * 2; g += 2) { + int src_g = g / 2; + + for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { + for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; + bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); + bsrc++) { + if (((bsrc & 0xf0) >> 4) != b) + continue; + + for (i = 0; i < bsize; i++) + new_lt[i] |= old_lt[bsrc * bsize + i]; + } + + new_lt += bsize; + } + + for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { + for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; + bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); + bsrc++) { + if ((bsrc & 0x0f) != b) + continue; + + for (i = 0; i < bsize; i++) + new_lt[i] |= old_lt[bsrc * bsize + i]; + } + + new_lt += bsize; + } + } +} + +/** + * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed + * @f: Field containing lookup table + */ +static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) +{ + unsigned long *new_lt; + int groups, bb; + size_t lt_size; + + lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * + sizeof(*f->lt); + + if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && + lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { + groups = f->groups * 2; + bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; + + lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * + sizeof(*f->lt); + } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && + lt_size < NFT_PIPAPO_LT_SIZE_LOW) { + groups = f->groups / 2; + bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; + + lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * + sizeof(*f->lt); + + /* Don't increase group width if the resulting lookup table size + * would exceed the upper size threshold for a "small" set. 
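+ * Together with the hysteresis gap between NFT_PIPAPO_LT_SIZE_LOW and
+ * NFT_PIPAPO_LT_SIZE_HIGH, this check should prevent repeated switching
+ * between the two group widths when the table size hovers around the
+ * thresholds.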
+ */ + if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) + return; + } else { + return; + } + + new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); + if (!new_lt) + return; + + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + if (f->bb == 4 && bb == 8) { + pipapo_lt_4b_to_8b(f->groups, f->bsize, + NFT_PIPAPO_LT_ALIGN(f->lt), + NFT_PIPAPO_LT_ALIGN(new_lt)); + } else if (f->bb == 8 && bb == 4) { + pipapo_lt_8b_to_4b(f->groups, f->bsize, + NFT_PIPAPO_LT_ALIGN(f->lt), + NFT_PIPAPO_LT_ALIGN(new_lt)); + } else { + BUG(); + } + + f->groups = groups; + f->bb = bb; + kvfree(f->lt); + NFT_PIPAPO_LT_ASSIGN(f, new_lt); +} + +/** * pipapo_insert() - Insert new rule in field given input key and mask length * @f: Field containing lookup table * @k: Input key for classification, without nftables padding @@ -849,7 +901,7 @@ static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int mask_bits) { - int rule = f->rules++, group, ret; + int rule = f->rules++, group, ret, bit_offset = 0; ret = pipapo_resize(f, f->rules - 1, f->rules); if (ret) @@ -859,28 +911,33 @@ static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, int i, v; u8 mask; - if (group % 2) - v = k[group / 2] & 0x0f; - else - v = k[group / 2] >> 4; + v = k[group / (BITS_PER_BYTE / f->bb)]; + v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); + v >>= (BITS_PER_BYTE - bit_offset) - f->bb; - if (mask_bits >= (group + 1) * 4) { + bit_offset += f->bb; + bit_offset %= BITS_PER_BYTE; + + if (mask_bits >= (group + 1) * f->bb) { /* Not masked */ pipapo_bucket_set(f, rule, group, v); - } else if (mask_bits <= group * 4) { + } else if (mask_bits <= group * f->bb) { /* Completely masked */ - for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) + for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) pipapo_bucket_set(f, rule, group, i); } else { /* The mask limit falls on this group */ - mask = 0x0f >> (mask_bits - group * 4); - for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { + mask = GENMASK(f->bb - 1, 0); + mask >>= mask_bits - group * f->bb; + for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { if ((i & ~mask) == (v & ~mask)) pipapo_bucket_set(f, rule, group, i); } } } + pipapo_lt_bits_adjust(f); + return 1; } @@ -1053,8 +1110,12 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, for_each_possible_cpu(i) { unsigned long *scratch; +#ifdef NFT_PIPAPO_ALIGN + unsigned long *scratch_aligned; +#endif - scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, + scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2 + + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL, cpu_to_node(i)); if (!scratch) { /* On failure, there's no need to undo previous @@ -1070,6 +1131,11 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, kfree(*per_cpu_ptr(clone->scratch, i)); *per_cpu_ptr(clone->scratch, i) = scratch; + +#ifdef NFT_PIPAPO_ALIGN + scratch_aligned = NFT_PIPAPO_LT_ALIGN(scratch); + *per_cpu_ptr(clone->scratch_aligned, i) = scratch_aligned; +#endif } return 0; @@ -1098,21 +1164,41 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, struct nft_pipapo_field *f; int i, bsize_max, err = 0; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) + end = (const u8 *)nft_set_ext_key_end(ext)->data; + else + end = start; + dup = pipapo_get(net, set, start, genmask); - if (PTR_ERR(dup) == -ENOENT) { - if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { - end = (const u8 *)nft_set_ext_key_end(ext)->data; - dup = pipapo_get(net, set, end, 
nft_genmask_next(net)); - } else { - end = start; + if (!IS_ERR(dup)) { + /* Check if we already have the same exact entry */ + const struct nft_data *dup_key, *dup_end; + + dup_key = nft_set_ext_key(&dup->ext); + if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) + dup_end = nft_set_ext_key_end(&dup->ext); + else + dup_end = dup_key; + + if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && + !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { + *ext2 = &dup->ext; + return -EEXIST; } + + return -ENOTEMPTY; + } + + if (PTR_ERR(dup) == -ENOENT) { + /* Look for partially overlapping entries */ + dup = pipapo_get(net, set, end, nft_genmask_next(net)); } if (PTR_ERR(dup) != -ENOENT) { if (IS_ERR(dup)) return PTR_ERR(dup); *ext2 = &dup->ext; - return -EEXIST; + return -ENOTEMPTY; } /* Validate */ @@ -1123,11 +1209,11 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, return -ENOSPC; if (memcmp(start_p, end_p, - f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) return -EINVAL; - start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); - end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } /* Insert */ @@ -1141,22 +1227,19 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, rulemap[i].to = f->rules; ret = memcmp(start, end, - f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); - if (!ret) { - ret = pipapo_insert(f, start, - f->groups * NFT_PIPAPO_GROUP_BITS); - } else { - ret = pipapo_expand(f, start, end, - f->groups * NFT_PIPAPO_GROUP_BITS); - } + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); + if (!ret) + ret = pipapo_insert(f, start, f->groups * f->bb); + else + ret = pipapo_expand(f, start, end, f->groups * f->bb); if (f->bsize > bsize_max) bsize_max = f->bsize; rulemap[i].n = ret; - start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); - end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { @@ -1200,23 +1283,35 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) if (!new->scratch) goto out_scratch; +#ifdef NFT_PIPAPO_ALIGN + new->scratch_aligned = alloc_percpu(*new->scratch_aligned); + if (!new->scratch_aligned) + goto out_scratch; +#endif + rcu_head_init(&new->rcu); src = old->f; dst = new->f; for (i = 0; i < old->field_count; i++) { + unsigned long *new_lt; + memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * - src->bsize * sizeof(*dst->lt), - GFP_KERNEL); - if (!dst->lt) + new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * + src->bsize * sizeof(*dst->lt) + + NFT_PIPAPO_ALIGN_HEADROOM, + GFP_KERNEL); + if (!new_lt) goto out_lt; - memcpy(dst->lt, src->lt, + NFT_PIPAPO_LT_ASSIGN(dst, new_lt); + + memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), + NFT_PIPAPO_LT_ALIGN(src->lt), src->bsize * sizeof(*dst->lt) * - src->groups * NFT_PIPAPO_BUCKETS); + src->groups * NFT_PIPAPO_BUCKETS(src->bb)); dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); if (!dst->mt) @@ -1237,8 +1332,11 @@ out_lt: kvfree(dst->lt); dst--; } - free_percpu(new->scratch); +#ifdef NFT_PIPAPO_ALIGN + free_percpu(new->scratch_aligned); +#endif out_scratch: + free_percpu(new->scratch); kfree(new); return ERR_PTR(-ENOMEM); @@ -1394,9 +1492,10 @@ static void pipapo_drop(struct nft_pipapo_match *m, 
unsigned long *pos; int b; - pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; + pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * + NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; - for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { bitmap_cut(pos, pos, rulemap[i].to, rulemap[i].n, f->bsize * BITS_PER_LONG); @@ -1414,6 +1513,8 @@ static void pipapo_drop(struct nft_pipapo_match *m, ; } f->rules -= rulemap[i].n; + + pipapo_lt_bits_adjust(f); } } @@ -1498,6 +1599,9 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) for_each_possible_cpu(i) kfree(*per_cpu_ptr(m->scratch, i)); +#ifdef NFT_PIPAPO_ALIGN + free_percpu(m->scratch_aligned); +#endif free_percpu(m->scratch); pipapo_free_fields(m); @@ -1690,30 +1794,33 @@ static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, int rule_count, u8 *left, u8 *right) { + int g, mask_len = 0, bit_offset = 0; u8 *l = left, *r = right; - int g, mask_len = 0; for (g = 0; g < f->groups; g++) { int b, x0, x1; x0 = -1; x1 = -1; - for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { unsigned long *pos; - pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + pos = NFT_PIPAPO_LT_ALIGN(f->lt) + + (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; if (test_bit(first_rule, pos) && x0 == -1) x0 = b; if (test_bit(first_rule + rule_count - 1, pos)) x1 = b; } - if (g % 2) { - *(l++) |= x0 & 0x0f; - *(r++) |= x1 & 0x0f; - } else { - *l |= x0 << 4; - *r |= x1 << 4; + *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); + *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); + + bit_offset += f->bb; + if (bit_offset >= BITS_PER_BYTE) { + bit_offset %= BITS_PER_BYTE; + l++; + r++; } if (x1 - x0 == 0) @@ -1748,8 +1855,9 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, pipapo_get_boundaries(f, first_rule, rule_count, left, right); - return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && - !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); + return !memcmp(start, left, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && + !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); } /** @@ -1801,8 +1909,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, rules_fx = f->mt[start].n; start = f->mt[start].to; - match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); - match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } if (i == m->field_count) { @@ -1885,56 +1993,24 @@ static u64 nft_pipapo_privsize(const struct nlattr * const nla[], } /** - * nft_pipapo_estimate() - Estimate set size, space and lookup complexity - * @desc: Set description, element count and field description used here + * nft_pipapo_estimate() - Set size, space and lookup complexity + * @desc: Set description, element count and field description used * @features: Flags: NFT_SET_INTERVAL needs to be there * @est: Storage for estimation data * - * The size for this set type can vary dramatically, as it depends on the number - * of rules (composing netmasks) the entries expand to. We compute the worst - * case here. 
- * - * In general, for a non-ranged entry or a single composing netmask, we need - * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that - * is, each input bit needs four bits of matching data), plus a bucket in the - * mapping table for each field. - * - * Return: true only for compatible range concatenations + * Return: true if set description is compatible, false otherwise */ static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { - unsigned long entry_size; - int i; - - if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + if (!(features & NFT_SET_INTERVAL) || + desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - for (i = 0, entry_size = 0; i < desc->field_count; i++) { - unsigned long rules; - - if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) - return false; - - /* Worst-case ranges for each concatenated field: each n-bit - * field can expand to up to n * 2 rules in each bucket, and - * each rule also needs a mapping bucket. - */ - rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; - entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; - entry_size += rules * sizeof(union nft_pipapo_map_bucket); - } - - /* Rules in lookup and mapping tables are needed for each entry */ - est->size = desc->size * entry_size; - if (est->size && div_u64(est->size, desc->size) != entry_size) + est->size = pipapo_estimate_size(desc); + if (!est->size) return false; - est->size += sizeof(struct nft_pipapo) + - sizeof(struct nft_pipapo_match) * 2; - - est->size += sizeof(struct nft_pipapo_field) * desc->field_count; - est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; @@ -1961,38 +2037,52 @@ static int nft_pipapo_init(const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; struct nft_pipapo_field *f; - int err, i; + int err, i, field_count; + + field_count = desc->field_count ? : 1; - if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + if (field_count > NFT_PIPAPO_MAX_FIELDS) return -EINVAL; - m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * field_count, GFP_KERNEL); if (!m) return -ENOMEM; - m->field_count = desc->field_count; + m->field_count = field_count; m->bsize_max = 0; m->scratch = alloc_percpu(unsigned long *); if (!m->scratch) { err = -ENOMEM; - goto out_free; + goto out_scratch; } for_each_possible_cpu(i) *per_cpu_ptr(m->scratch, i) = NULL; +#ifdef NFT_PIPAPO_ALIGN + m->scratch_aligned = alloc_percpu(unsigned long *); + if (!m->scratch_aligned) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch_aligned, i) = NULL; +#endif + rcu_head_init(&m->rcu); nft_pipapo_for_each_field(f, i, m) { - f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; - priv->groups += f->groups; + int len = desc->field_len[i] ? 
: set->klen; - priv->width += round_up(desc->field_len[i], sizeof(u32)); + f->bb = NFT_PIPAPO_GROUP_BITS_INIT; + f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); + + priv->width += round_up(len, sizeof(u32)); f->bsize = 0; f->rules = 0; - f->lt = NULL; + NFT_PIPAPO_LT_ASSIGN(f, NULL); f->mt = NULL; } @@ -2010,7 +2100,11 @@ static int nft_pipapo_init(const struct nft_set *set, return 0; out_free: +#ifdef NFT_PIPAPO_ALIGN + free_percpu(m->scratch_aligned); +#endif free_percpu(m->scratch); +out_scratch: kfree(m); return err; @@ -2045,16 +2139,21 @@ static void nft_pipapo_destroy(const struct nft_set *set) nft_set_elem_destroy(set, e, true); } +#ifdef NFT_PIPAPO_ALIGN + free_percpu(m->scratch_aligned); +#endif for_each_possible_cpu(cpu) kfree(*per_cpu_ptr(m->scratch, cpu)); free_percpu(m->scratch); - pipapo_free_fields(m); kfree(m); priv->match = NULL; } if (priv->clone) { +#ifdef NFT_PIPAPO_ALIGN + free_percpu(priv->clone->scratch_aligned); +#endif for_each_possible_cpu(cpu) kfree(*per_cpu_ptr(priv->clone->scratch, cpu)); free_percpu(priv->clone->scratch); @@ -2081,8 +2180,7 @@ static void nft_pipapo_gc_init(const struct nft_set *set) priv->last_gc = jiffies; } -struct nft_set_type nft_set_pipapo_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_pipapo_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -2102,3 +2200,26 @@ struct nft_set_type nft_set_pipapo_type __read_mostly = { .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; + +#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2) +const struct nft_set_type nft_set_pipapo_avx2_type = { + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT, + .ops = { + .lookup = nft_pipapo_avx2_lookup, + .insert = nft_pipapo_insert, + .activate = nft_pipapo_activate, + .deactivate = nft_pipapo_deactivate, + .flush = nft_pipapo_flush, + .remove = nft_pipapo_remove, + .walk = nft_pipapo_walk, + .get = nft_pipapo_get, + .privsize = nft_pipapo_privsize, + .estimate = nft_pipapo_avx2_estimate, + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, +}; +#endif diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h new file mode 100644 index 000000000000..25a75591583e --- /dev/null +++ b/net/netfilter/nft_set_pipapo.h @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef _NFT_SET_PIPAPO_H + +#include <linux/log2.h> +#include <net/ipv6.h> /* For the maximum length of a field */ + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Restrict usage to multiple fields, make sure rbtree is used otherwise */ +#define NFT_PIPAPO_MIN_FIELDS 2 + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Bits to be grouped together in table buckets depending on set size */ +#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET +#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8 +#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4 +#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \ + BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \ + (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4)) +#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb) + +/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the + * small group width, and switch 
to the big group width if the table gets + * smaller than NFT_PIPAPO_LT_SIZE_LOW. + * + * Picking 2MiB as threshold (for a single table) avoids as much as possible + * crossing page boundaries on most architectures (x86-64 and MIPS huge pages, + * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which + * keeps performance nice in case kvmalloc() gives us non-contiguous areas. + */ +#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21) +#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16) +#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD +#define NFT_PIPAPO_LT_SIZE_LOW NFT_PIPAPO_LT_SIZE_THRESHOLD - \ + NFT_PIPAPO_LT_SIZE_HYSTERESIS + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \ + (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(f) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \ + NFT_PIPAPO_GROUPS_PER_BYTE(f)) + +/* Number of buckets given by 2 ^ n, with n bucket bits */ +#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb)) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +/* Definitions for vectorised implementations */ +#ifdef NFT_PIPAPO_ALIGN +#define NFT_PIPAPO_ALIGN_HEADROOM \ + (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) +#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) +#define NFT_PIPAPO_LT_ASSIGN(field, x) \ + do { \ + (field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x); \ + (field)->lt = (x); \ + } while (0) +#else +#define NFT_PIPAPO_ALIGN_HEADROOM 0 +#define NFT_PIPAPO_LT_ALIGN(lt) (lt) +#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x)) +#endif /* NFT_PIPAPO_ALIGN */ + +#define nft_pipapo_for_each_field(field, index, match) \ + for ((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + +/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @bb: Number of bits grouped together in lookup table buckets + * @lt: Lookup table: 'groups' rows of buckets + * @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + int bb; +#ifdef NFT_PIPAPO_ALIGN + unsigned long *lt_aligned; +#endif + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + 
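+/* A minimal sketch of how the alignment helpers above combine, following
+ * the allocation pattern used throughout nft_set_pipapo.c (lt_size, group
+ * and f stand in for the callers' locals):
+ *
+ *	new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL);
+ *	if (!new_lt)
+ *		return -ENOMEM;
+ *	NFT_PIPAPO_LT_ASSIGN(f, new_lt);
+ *	pos = NFT_PIPAPO_LT_ALIGN(f->lt) +
+ *	      group * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize;
+ *
+ * NFT_PIPAPO_LT_ASSIGN() keeps the raw pointer so kvfree() still works,
+ * while readers always dereference the aligned alias. Without
+ * NFT_PIPAPO_ALIGN, the headroom is zero and all three helpers collapse
+ * to plain pointer operations.
+ */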
+/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @scratch_aligned: Version of @scratch aligned to NFT_PIPAPO_ALIGN bytes + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; +#ifdef NFT_PIPAPO_ALIGN + unsigned long * __percpu *scratch_aligned; +#endif + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[]; +}; + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst, + union nft_pipapo_map_bucket *mt, bool match_only); + +/** + * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets + * @f: Field including lookup table + * @dst: Area to store result + * @data: Input data selecting table buckets + */ +static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f, + unsigned long *dst, + const u8 *data) +{ + unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); + int group; + + for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) { + u8 v; + + v = *data >> 4; + __bitmap_and(dst, dst, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS(4); + + v = *data & 0x0f; + __bitmap_and(dst, dst, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS(4); + } +} + +/** + * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets + * @f: Field including lookup table + * @dst: Area to store result + * @data: Input data selecting table buckets + */ +static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f, + unsigned long *dst, + const u8 *data) +{ + unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); + int group; + + for (group = 0; group < f->groups; group++, data++) { + __bitmap_and(dst, dst, lt + *data * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS(8); + } +} + +/** + * pipapo_estimate_size() - Estimate worst-case for set size + * @desc: Set description, element count and field description used here + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that + * is, each input bit needs four bits of matching data), plus a bucket in the + * mapping table for each field. 
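+ * For example, assuming the initial 8-bit groups and a 64-bit platform, a
+ * four-byte field gives rules = ilog2(32) * 2 = 10, that is,
+ * 10 * 256 / 8 = 320 bytes of lookup table plus 10 * 8 = 80 bytes of
+ * mapping buckets, roughly 400 bytes contributed per entry by that field.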
+ * + * Return: worst-case set size in bytes, 0 on any overflow + */ +static u64 pipapo_estimate_size(const struct nft_set_desc *desc) +{ + unsigned long entry_size; + u64 size; + int i; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return 0; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. + */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * + NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) / + BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + size = desc->size * entry_size; + if (size && div_u64(size, desc->size) != entry_size) + return 0; + + size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2; + + size += sizeof(struct nft_pipapo_field) * desc->field_count; + + return size; +} + +#endif /* _NFT_SET_PIPAPO_H */ diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c new file mode 100644 index 000000000000..d65ae0e23028 --- /dev/null +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -0,0 +1,1223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <uapi/linux/netfilter/nf_tables.h> +#include <linux/bitmap.h> +#include <linux/bitops.h> + +#include <linux/compiler.h> +#include <asm/fpu/api.h> + +#include "nft_set_pipapo_avx2.h" +#include "nft_set_pipapo.h" + +#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) + +/* Load from memory into YMM register with non-temporal hint ("stream load"), + * that is, don't fetch lines from memory into the cache. This avoids pushing + * precious packet data out of the cache hierarchy, and is appropriate when: + * + * - loading buckets from lookup tables, as they are not going to be used + * again before packets are entirely classified + * + * - loading the result bitmap from the previous field, as it's never used + * again + */ +#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \ + asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc)) + +/* Stream a single lookup table bucket into YMM register given lookup table, + * group index, value of packet bits, bucket size. + */ +#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \ + NFT_PIPAPO_AVX2_LOAD(reg, \ + lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ + (v)) * (bsize)]) +#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \ + NFT_PIPAPO_AVX2_LOAD(reg, \ + lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ + (v)) * (bsize)]) + +/* Bitwise AND: the staple operation of this algorithm */ +#define NFT_PIPAPO_AVX2_AND(dst, a, b) \ + asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst) + +/* Jump to label if @reg is zero */ +#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ + asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ + "je %l[" #label "]" : : : : label) + +/* Store 256 bits from YMM register into memory. 
Contrary to bucket load + * operation, we don't bypass the cache here, as stored matching results + * are always used shortly after. + */ +#define NFT_PIPAPO_AVX2_STORE(loc, reg) \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)) + +/* Zero out a complete YMM register, @reg */ +#define NFT_PIPAPO_AVX2_ZERO(reg) \ + asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index); + +/** + * nft_pipapo_avx2_prepare() - Prepare before main algorithm body + * + * This zeroes out ymm15, which is later used whenever we need to clear a + * memory location, by storing its content into memory. + */ +static void nft_pipapo_avx2_prepare(void) +{ + NFT_PIPAPO_AVX2_ZERO(15); +} + +/** + * nft_pipapo_avx2_fill() - Fill a bitmap region with ones + * @data: Base memory area + * @start: First bit to set + * @len: Count of bits to fill + * + * This is nothing else than a version of bitmap_set(), as used e.g. by + * pipapo_refill(), tailored for the microarchitectures using it and better + * suited for the specific usage: it's very likely that we'll set a small number + * of bits, not crossing a word boundary, and correct branch prediction is + * critical here. + * + * This function doesn't actually use any AVX2 instruction. + */ +static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) +{ + int offset = start % BITS_PER_LONG; + unsigned long mask; + + data += start / BITS_PER_LONG; + + if (likely(len == 1)) { + *data |= BIT(offset); + return; + } + + if (likely(len < BITS_PER_LONG || offset)) { + if (likely(len + offset <= BITS_PER_LONG)) { + *data |= GENMASK(len - 1 + offset, offset); + return; + } + + *data |= ~0UL << offset; + len -= BITS_PER_LONG - offset; + data++; + + if (len <= BITS_PER_LONG) { + mask = ~0UL >> (BITS_PER_LONG - len); + *data |= mask; + return; + } + } + + memset(data, 0xff, len / BITS_PER_BYTE); + data += len / BITS_PER_LONG; + + len %= BITS_PER_LONG; + if (len) + *data |= ~0UL >> (BITS_PER_LONG - len); +} + +/** + * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits + * @offset: Start from given bitmap (equivalent to bucket) offset, in longs + * @map: Bitmap to be scanned for set bits + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @len: Length of bitmap in longs + * @last: Return index of first set bit, if this is the last field + * + * This is an alternative implementation of pipapo_refill() suitable for usage + * with AVX2 lookup routines: we know there are four words to be scanned, at + * a given offset inside the map, for each matching iteration. + * + * This function doesn't actually use any AVX2 instruction. + * + * Return: first set bit index if @last, index of first filled word otherwise. 
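+ * The fixed count of four words is not arbitrary: one 256-bit AVX2 match
+ * covers XSAVE_YMM_SIZE / BITS_PER_LONG = 4 longs on x86_64, so only four
+ * words of @map can hold set bits for each stored match result.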
+ */ +static int nft_pipapo_avx2_refill(int offset, unsigned long *map, + unsigned long *dst, + union nft_pipapo_map_bucket *mt, bool last) +{ + int ret = -1; + +#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \ + do { \ + while (map[(x)]) { \ + int r = __builtin_ctzl(map[(x)]); \ + int i = (offset + (x)) * BITS_PER_LONG + r; \ + \ + if (last) \ + return i; \ + \ + nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \ + \ + if (ret == -1) \ + ret = mt[i].to; \ + \ + map[(x)] &= ~(1UL << r); \ + } \ + } while (0) + + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0); + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1); + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2); + NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3); +#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * Load buckets from lookup table corresponding to the values of each 4-bit + * group of packet bytes, and perform a bitwise intersection between them. If + * this is the first field in the set, simply AND the buckets together + * (equivalent to using an all-ones starting bitmap), use the provided starting + * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next + * working bitmap, @fill. + * + * This is used for 8-bit fields (i.e. protocol numbers). + * + * Out-of-order (and superscalar) execution is vital here, so it's critical to + * avoid false data dependencies. CPU and compiler could (mostly) take care of + * this on their own, but the operation ordering is explicitly given here with + * a likely execution order in mind, to highlight possible stalls. That's why + * a number of logically distinct operations (i.e. loading buckets, intersecting + * buckets) are interleaved. + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
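+ * For example, a protocol number of 0x85 is decomposed into pg[0] = 0x8
+ * and pg[1] = 0x5, each nibble selecting one bucket from its lookup table
+ * group before the two buckets are intersected.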
+ */ +static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing); + NFT_PIPAPO_AVX2_AND(3, 0, 1); + NFT_PIPAPO_AVX2_AND(4, 2, 3); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 16-bit fields (i.e. ports). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
+ */ +static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_AND(7, 4, 5); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_AND(5, 0, 1); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(7, 4, 5); + /* Stall */ + NFT_PIPAPO_AVX2_AND(7, 6, 7); + } + + /* Stall */ + NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 7); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 32-bit fields (i.e. IPv4 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
+ */ +static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, + }; + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(8, 2, 3); + NFT_PIPAPO_AVX2_AND(9, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(11, 6, 7); + NFT_PIPAPO_AVX2_AND(12, 8, 9); + NFT_PIPAPO_AVX2_AND(13, 10, 11); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(1, 12, 13); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_AND(8, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(10, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(12, 6, 7); + NFT_PIPAPO_AVX2_AND(13, 8, 9); + NFT_PIPAPO_AVX2_AND(14, 10, 11); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(1, 12, 13); + NFT_PIPAPO_AVX2_AND(1, 1, 14); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 1); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
+ */ +static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, + pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, + }; + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (!first) + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + + if (!first) { + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(1, 1, 0); + } + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(9, 1, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(11, 5, 6); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize); + NFT_PIPAPO_AVX2_AND(13, 7, 8); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize); + + NFT_PIPAPO_AVX2_AND(0, 9, 10); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize); + NFT_PIPAPO_AVX2_AND(2, 11, 12); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); + NFT_PIPAPO_AVX2_AND(4, 13, 14); + NFT_PIPAPO_AVX2_AND(5, 0, 1); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + + /* Stalls */ + NFT_PIPAPO_AVX2_AND(7, 4, 5); + NFT_PIPAPO_AVX2_AND(8, 6, 7); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 8); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 128-bit fields (i.e. IPv6 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
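+ * Note that, with 32 groups but only 16 ymm registers, bucket loads and
+ * bitwise ANDs are issued in waves that recycle register numbers 0 to 14,
+ * while ymm15 stays reserved as the all-zeroes register set up by
+ * nft_pipapo_avx2_prepare().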
+ */ +static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, + pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, + pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, + pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf, + pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf, + pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf, + pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf, + pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf, + }; + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (!first) + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); + if (!first) { + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(1, 1, 0); + } + + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); + NFT_PIPAPO_AVX2_AND(8, 1, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); + NFT_PIPAPO_AVX2_AND(10, 5, 6); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); + NFT_PIPAPO_AVX2_AND(12, 7, 8); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize); + NFT_PIPAPO_AVX2_AND(14, 9, 10); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize); + NFT_PIPAPO_AVX2_AND(1, 11, 12); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); + NFT_PIPAPO_AVX2_AND(4, 13, 14); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize); + NFT_PIPAPO_AVX2_AND(7, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize); + NFT_PIPAPO_AVX2_AND(9, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize); + NFT_PIPAPO_AVX2_AND(11, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize); + NFT_PIPAPO_AVX2_AND(13, 6, 7); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize); + + NFT_PIPAPO_AVX2_AND(0, 8, 9); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize); + NFT_PIPAPO_AVX2_AND(2, 10, 11); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize); + NFT_PIPAPO_AVX2_AND(4, 12, 13); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize); + NFT_PIPAPO_AVX2_AND(6, 14, 0); + NFT_PIPAPO_AVX2_AND(7, 1, 2); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize); + NFT_PIPAPO_AVX2_AND(9, 3, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize); + NFT_PIPAPO_AVX2_AND(11, 5, 6); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize); + NFT_PIPAPO_AVX2_AND(13, 7, 8); + + NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize); + NFT_PIPAPO_AVX2_AND(1, 9, 10); + NFT_PIPAPO_AVX2_AND(2, 11, 12); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize); + NFT_PIPAPO_AVX2_AND(4, 13, 14); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize); + NFT_PIPAPO_AVX2_AND(6, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize); + 
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize); + NFT_PIPAPO_AVX2_AND(9, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize); + NFT_PIPAPO_AVX2_AND(11, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize); + + NFT_PIPAPO_AVX2_AND(0, 6, 7); + NFT_PIPAPO_AVX2_AND(1, 8, 9); + NFT_PIPAPO_AVX2_AND(2, 10, 11); + NFT_PIPAPO_AVX2_AND(3, 12, 0); + + /* Stalls */ + NFT_PIPAPO_AVX2_AND(4, 1, 2); + NFT_PIPAPO_AVX2_AND(5, 3, 4); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 5); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 8-bit fields (i.e. protocol numbers). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). + */ +static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_AND(2, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 2); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 16-bit fields (i.e. ports). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
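To make the two-group case concrete: a 16-bit port is matched byte by byte in network order, each byte selecting one of the 256 buckets of its group. Illustration only:

#include <stdint.h>

/* Port 8080 = 0x1f90: bucket 0x1f in group 0, bucket 0x90 in group 1. */
static void port_to_groups(uint16_t port, uint8_t g[2])
{
	g[0] = port >> 8;
	g[1] = port & 0xff;
}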
+ */ +static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + } else { + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(3, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(4, 3, 2); + } + + /* Stall */ + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 32-bit fields (i.e. IPv4 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
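The 8-bit routines skip the nibble split entirely: the key byte itself is the bucket index. The scalar model differs from the 4-bit one above only in the bucket count, 256 per group instead of 16; again an illustration under the same assumed layout, not patch code:

#include <stdint.h>

static unsigned long pipapo_slice_8b(const unsigned long *lt,
				     unsigned long bsize, const uint8_t *pkt,
				     int groups, unsigned long slice)
{
	unsigned long res = ~0UL;
	int g;

	for (g = 0; g < groups; g++)
		res &= lt[((unsigned long)g * 256 + pkt[g]) * bsize + slice];

	return res;
}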
+ */ +static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); + + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + NFT_PIPAPO_AVX2_AND(6, 2, 3); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(7, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 6, 7); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 0); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
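All variants share the same per-slice epilogue, easier to follow in scalar form. "nothing" means the previous field left no candidates in this slice; "nomatch" zeroes the consumed slot (the STORE of register 15, which nft_pipapo_avx2_prepare() clears) so the two scratch maps can simply be swapped between fields. A sketch with a stubbed-out refill, names invented:

#include <stdbool.h>

/* Placeholder for nft_pipapo_avx2_refill(): maps matched rule bits
 * into the next field's bitmap and returns the first set bit, or -1.
 */
static int refill_stub(unsigned long *map, unsigned long *fill, int i,
		       bool last)
{
	(void)map; (void)fill; (void)i; (void)last;
	return -1;
}

static int slice_step(unsigned long res, unsigned long *map,
		      unsigned long *fill, int i, bool first, bool last)
{
	if (!first) {
		if (!map[i])		/* "nothing" */
			return -1;
		res &= map[i];
	}
	if (!res) {			/* "nomatch" */
		map[i] = 0;
		return -1;
	}
	map[i] = res;
	return refill_stub(map, fill, i, last);
}

When the field is not the last one, the first matching bit b becomes ret = b / XSAVE_YMM_SIZE, i.e. the index of the 256-bit slice where the next field's scan may resume: a match at bit 700 gives slice 2.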
+ */ +static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (first) { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize); + + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize); + NFT_PIPAPO_AVX2_AND(7, 2, 3); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(1, 6, 7); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + } else { + NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); + + NFT_PIPAPO_AVX2_AND(5, 0, 1); + NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize); + NFT_PIPAPO_AVX2_AND(2, 6, 7); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(3, 0, 1); + NFT_PIPAPO_AVX2_AND(4, 2, 3); + } + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * See nft_pipapo_avx2_lookup_4b_2(). + * + * This is used for 128-bit fields (i.e. IPv6 addresses). + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
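For the 16-group routine that follows, no key preparation is needed at all: the groups are simply the 16 bytes of the IPv6 address in wire order. Illustration:

#include <stdint.h>
#include <string.h>

/* 128-bit field: one 8-bit group per address byte, network order. */
static void ipv6_to_groups(const uint8_t addr[16], uint8_t g[16])
{
	memcpy(g, addr, 16);
}

Compared with the 4-bit layout, this halves the number of groups to AND per slice (16 instead of 32) at the cost of sixteen times as many buckets per group (256 instead of 16).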
+ */ +static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; + unsigned long *lt = f->lt, bsize = f->bsize; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { + int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; + + if (!first) + NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); + + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); + if (!first) { + NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); + NFT_PIPAPO_AVX2_AND(1, 1, 0); + } + NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); + + NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize); + NFT_PIPAPO_AVX2_AND(6, 1, 2); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize); + NFT_PIPAPO_AVX2_AND(0, 3, 4); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize); + + NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize); + NFT_PIPAPO_AVX2_AND(3, 5, 6); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); + + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); + NFT_PIPAPO_AVX2_AND(2, 6, 7); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize); + NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize); + NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize); + NFT_PIPAPO_AVX2_AND(2, 6, 7); + NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize); + NFT_PIPAPO_AVX2_AND(4, 0, 1); + + /* Stall */ + NFT_PIPAPO_AVX2_AND(5, 2, 3); + NFT_PIPAPO_AVX2_AND(6, 4, 5); + + NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch); + NFT_PIPAPO_AVX2_STORE(map[i_ul], 6); + + b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); + if (last) + return b; + + if (unlikely(ret == -1)) + ret = b / XSAVE_YMM_SIZE; + + continue; + +nomatch: + NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); +nothing: + ; + } + + return ret; +} + +/** + * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes + * @map: Previous match result, used as initial bitmap + * @fill: Destination bitmap to be filled with current match result + * @f: Field, containing lookup and mapping tables + * @offset: Ignore buckets before the given index, no bits are filled there + * @pkt: Packet data, pointer to input nftables register + * @first: If this is the first field, don't source previous result + * @last: Last field: stop at the first match and return bit index + * + * This function should never be called, but is provided for the case the field + * size doesn't match any of the known data types. Matching rate is + * substantially lower than AVX2 routines. + * + * Return: -1 on no match, rule index of match if @last, otherwise first long + * word index to be checked next (i.e. first filled word). 
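One constraint shapes both the fallback below and the entry point after it: YMM registers may only be touched between kernel_fpu_begin() and kernel_fpu_end(), and since that section also disables preemption it doubles as the protection for the per-CPU scratch maps (the comment in the lookup function below says as much). The bare pattern, in isolation:

kernel_fpu_begin();	/* save user FPU/SIMD state, disable preemption */
/* ... AVX2 work: no sleeping allowed in here ... */
kernel_fpu_end();	/* restore state, re-enable preemption */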
+ */ +static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill, + struct nft_pipapo_field *f, int offset, + const u8 *pkt, bool first, bool last) +{ + unsigned long *lt = f->lt, bsize = f->bsize; + int i, ret = -1, b; + + lt += offset * NFT_PIPAPO_LONGS_PER_M256; + + if (first) + memset(map, 0xff, bsize * sizeof(*map)); + + for (i = offset; i < bsize; i++) { + if (f->bb == 8) + pipapo_and_field_buckets_8bit(f, map, pkt); + else + pipapo_and_field_buckets_4bit(f, map, pkt); + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + + b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last); + + if (last) + return b; + + if (ret == -1) + ret = b / XSAVE_YMM_SIZE; + } + + return ret; +} + +/** + * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity + * @desc: Set description, element count and field description used + * @features: Flags: NFT_SET_INTERVAL needs to be there + * @est: Storage for estimation data + * + * Return: true if set is compatible and AVX2 available, false otherwise. + */ +bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + if (!(features & NFT_SET_INTERVAL) || + desc->field_count < NFT_PIPAPO_MIN_FIELDS) + return false; + + if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + return false; + + est->size = pipapo_estimate_size(desc); + if (!est->size) + return false; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. + * + * This implementation exploits the repetitive characteristic of the algorithm + * to provide a fast, vectorised version using the AVX2 SIMD instruction set. + * + * Return: true on match, false otherwise. + */ +bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res, *fill, *scratch; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i, ret = 0; + + m = rcu_dereference(priv->match); + + /* This also protects access to all data related to scratch maps */ + kernel_fpu_begin(); + + scratch = *raw_cpu_ptr(m->scratch_aligned); + if (unlikely(!scratch)) { + kernel_fpu_end(); + return false; + } + map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index); + + res = scratch + (map_index ? m->bsize_max : 0); + fill = scratch + (map_index ? 
0 : m->bsize_max); + + /* Starting map doesn't need to be set for this implementation */ + + nft_pipapo_avx2_prepare(); + +next_match: + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1, first = !i; + +#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ + (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ + ret, rp, \ + first, last)) + + if (likely(f->bb == 8)) { + if (f->groups == 1) { + NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1); + } else if (f->groups == 2) { + NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2); + } else if (f->groups == 4) { + NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4); + } else if (f->groups == 6) { + NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6); + } else if (f->groups == 16) { + NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); + } else { + ret = nft_pipapo_avx2_lookup_slow(res, fill, f, + ret, rp, + first, last); + } + } else { + if (f->groups == 2) { + NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2); + } else if (f->groups == 4) { + NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4); + } else if (f->groups == 8) { + NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8); + } else if (f->groups == 12) { + NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12); + } else if (f->groups == 32) { + NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); + } else { + ret = nft_pipapo_avx2_lookup_slow(res, fill, f, + ret, rp, + first, last); + } + } + NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; + +#undef NFT_SET_PIPAPO_AVX2_LOOKUP + + if (ret < 0) + goto out; + + if (last) { + *ext = &f->mt[ret].e->ext; + if (unlikely(nft_set_elem_expired(*ext) || + !nft_set_elem_active(*ext, genmask))) { + ret = 0; + goto next_match; + } + + goto out; + } + + swap(res, fill); + rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + } + +out: + if (i % 2) + raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index); + kernel_fpu_end(); + + return ret >= 0; +} diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h new file mode 100644 index 000000000000..396caf7bfca8 --- /dev/null +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _NFT_SET_PIPAPO_AVX2_H + +#ifdef CONFIG_AS_AVX2 +#include <asm/fpu/xstate.h> +#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) + +bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext); +bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est); +#endif /* CONFIG_AS_AVX2 */ + +#endif /* _NFT_SET_PIPAPO_AVX2_H */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 5000b938ab1e..3a5552e14f75 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); } +static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) +{ + return !nft_rbtree_interval_end(rbe); +} + static bool nft_rbtree_equal(const struct nft_set *set, const void *this, const struct nft_rbtree_elem *interval) { @@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (interval && nft_rbtree_equal(set, this, interval) && nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(interval)) + nft_rbtree_interval_start(interval)) continue; interval = rbe; } else if (d > 0) @@ -89,7 +94,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && - 
!nft_rbtree_interval_end(interval)) { + nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; } @@ -208,8 +213,43 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); struct nft_rbtree_elem *rbe; struct rb_node *parent, **p; + bool overlap = false; int d; + /* Detect overlaps as we descend the tree. Set the flag in these cases: + * + * a1. |__ _ _? >|__ _ _ (insert start after existing start) + * a2. _ _ __>| ?_ _ __| (insert end before existing end) + * a3. _ _ ___| ?_ _ _>| (insert end after existing end) + * a4. >|__ _ _ _ _ __| (insert start before existing end) + * + * and clear it later on, as we eventually reach the points indicated by + * '?' above, in the cases described below. We'll always meet these + * later, locally, due to tree ordering, and overlaps for the intervals + * that are the closest together are always evaluated last. + * + * b1. |__ _ _! >|__ _ _ (insert start after existing end) + * b2. _ _ __>| !_ _ __| (insert end before existing start) + * b3. !_____>| (insert end after existing start) + * + * Case a4. resolves to b1.: + * - if the inserted start element is the leftmost, because the '0' + * element in the tree serves as end element + * - otherwise, if an existing end is found. Note that end elements are + * always inserted after corresponding start elements. + * + * For a new, rightmost pair of elements, we'll hit cases b1. and b3., + * in that order. + * + * The flag is also cleared in two special cases: + * + * b4. |__ _ _!|<_ _ _ (insert start right before existing end) + * b5. |__ _ >|!__ _ _ (insert end right after existing start) + * + * which always happen as last step and imply that no further + * overlapping is possible. + */ + parent = NULL; p = &priv->root.rb_node; while (*p != NULL) { @@ -218,17 +258,42 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, d = memcmp(nft_set_ext_key(&rbe->ext), nft_set_ext_key(&new->ext), set->klen); - if (d < 0) + if (d < 0) { p = &parent->rb_left; - else if (d > 0) + + if (nft_rbtree_interval_start(new)) { + overlap = nft_rbtree_interval_start(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } else { + overlap = nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } + } else if (d > 0) { p = &parent->rb_right; - else { + + if (nft_rbtree_interval_end(new)) { + overlap = nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, + genmask); + } else if (nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, genmask)) { + overlap = true; + } + } else { if (nft_rbtree_interval_end(rbe) && - !nft_rbtree_interval_end(new)) { + nft_rbtree_interval_start(new)) { p = &parent->rb_left; - } else if (!nft_rbtree_interval_end(rbe) && + + if (nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(new)) { p = &parent->rb_right; + + if (nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; } else if (nft_set_elem_active(&rbe->ext, genmask)) { *ext = &rbe->ext; return -EEXIST; @@ -237,6 +302,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } } + + if (overlap) + return -ENOTEMPTY; + rb_link_node_rcu(&new->node, parent, p); rb_insert_color(&new->node, &priv->root); return 0; @@ -317,10 +386,10 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; else { if (nft_rbtree_interval_end(rbe) && - 
!nft_rbtree_interval_end(this)) { + nft_rbtree_interval_start(this)) { parent = parent->rb_left; continue; - } else if (!nft_rbtree_interval_end(rbe) && + } else if (nft_rbtree_interval_start(rbe) && nft_rbtree_interval_end(this)) { parent = parent->rb_right; continue; @@ -481,8 +550,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -struct nft_set_type nft_set_rbtree_type __read_mostly = { - .owner = THIS_MODULE, +const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { .privsize = nft_rbtree_privsize, diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 764e88682a81..30be5787fbde 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -11,6 +11,7 @@ #include <net/ip_tunnels.h> #include <net/vxlan.h> #include <net/erspan.h> +#include <net/geneve.h> struct nft_tunnel { enum nft_tunnel_keys key:8; @@ -144,6 +145,7 @@ struct nft_tunnel_opts { union { struct vxlan_metadata vxlan; struct erspan_metadata erspan; + u8 data[IP_TUNNEL_OPTS_MAX]; } u; u32 len; __be16 flags; @@ -301,9 +303,53 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, return 0; } +static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { + [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, + [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, +}; + +static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, + struct nft_tunnel_opts *opts) +{ + struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; + int err, data_len; + + err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_GENEVE_MAX, attr, + nft_tunnel_opts_geneve_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_TUNNEL_KEY_GENEVE_CLASS] || + !tb[NFTA_TUNNEL_KEY_GENEVE_TYPE] || + !tb[NFTA_TUNNEL_KEY_GENEVE_DATA]) + return -EINVAL; + + attr = tb[NFTA_TUNNEL_KEY_GENEVE_DATA]; + data_len = nla_len(attr); + if (data_len % 4) + return -EINVAL; + + opts->len += sizeof(*opt) + data_len; + if (opts->len > IP_TUNNEL_OPTS_MAX) + return -EINVAL; + + memcpy(opt->opt_data, nla_data(attr), data_len); + opt->length = data_len / 4; + opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]); + opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]); + opts->flags = TUNNEL_GENEVE_OPT; + + return 0; +} + static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = { + [NFTA_TUNNEL_KEY_OPTS_UNSPEC] = { + .strict_start_type = NFTA_TUNNEL_KEY_OPTS_GENEVE }, [NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, }, [NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, }, + [NFTA_TUNNEL_KEY_OPTS_GENEVE] = { .type = NLA_NESTED, }, }; static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, @@ -311,22 +357,43 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx, struct ip_tunnel_info *info, struct nft_tunnel_opts *opts) { - struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1]; - int err; + int err, rem, type = 0; + struct nlattr *nla; - err = nla_parse_nested_deprecated(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr, - nft_tunnel_opts_policy, NULL); + err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX, + nft_tunnel_opts_policy, NULL); if (err < 0) return err; - if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) { - err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN], - opts); - } else 
if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) { - err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN], - opts); - } else { - return -EOPNOTSUPP; + nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(nla)) { + case NFTA_TUNNEL_KEY_OPTS_VXLAN: + if (type) + return -EINVAL; + err = nft_tunnel_obj_vxlan_init(nla, opts); + if (err) + return err; + type = TUNNEL_VXLAN_OPT; + break; + case NFTA_TUNNEL_KEY_OPTS_ERSPAN: + if (type) + return -EINVAL; + err = nft_tunnel_obj_erspan_init(nla, opts); + if (err) + return err; + type = TUNNEL_ERSPAN_OPT; + break; + case NFTA_TUNNEL_KEY_OPTS_GENEVE: + if (type && type != TUNNEL_GENEVE_OPT) + return -EINVAL; + err = nft_tunnel_obj_geneve_init(nla, opts); + if (err) + return err; + type = TUNNEL_GENEVE_OPT; + break; + default: + return -EOPNOTSUPP; + } } return err; @@ -518,6 +585,25 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, break; } nla_nest_end(skb, inner); + } else if (opts->flags & TUNNEL_GENEVE_OPT) { + struct geneve_opt *opt; + int offset = 0; + + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; + while (opts->len > offset) { + opt = (struct geneve_opt *)opts->u.data + offset; + if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, + opt->type) || + nla_put(skb, NFTA_TUNNEL_KEY_GENEVE_DATA, + opt->length * 4, opt->opt_data)) + goto inner_failure; + offset += sizeof(*opt) + opt->length * 4; + } + nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0; diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index f56d3ed93e56..75bd0e5dd312 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/timer.h> +#include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> @@ -30,6 +31,7 @@ struct idletimer_tg { struct list_head entry; + struct alarm alarm; struct timer_list timer; struct work_struct work; @@ -37,6 +39,7 @@ struct idletimer_tg { struct device_attribute attr; unsigned int refcnt; + u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); @@ -62,20 +65,29 @@ static ssize_t idletimer_tg_show(struct device *dev, { struct idletimer_tg *timer; unsigned long expires = 0; + struct timespec64 ktimespec = {}; + long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); - if (timer) - expires = timer->timer.expires; + if (timer) { + if (timer->timer_type & XT_IDLETIMER_ALARM) { + ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); + ktimespec = ktime_to_timespec64(expires_alarm); + time_diff = ktimespec.tv_sec; + } else { + expires = timer->timer.expires; + time_diff = jiffies_to_msecs(expires - jiffies) / 1000; + } + } mutex_unlock(&list_mutex); - if (time_after(expires, jiffies)) - return sprintf(buf, "%u\n", - jiffies_to_msecs(expires - jiffies) / 1000); + if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) + return snprintf(buf, PAGE_SIZE, "%ld\n", time_diff); - return sprintf(buf, "0\n"); + return snprintf(buf, PAGE_SIZE, "0\n"); } static void idletimer_tg_work(struct work_struct *work) @@ -95,6 +107,16 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } +static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, + ktime_t now) +{ + struct idletimer_tg *timer = alarm->data; + + pr_debug("alarm %s expired\n", timer->attr.attr.name); + 
schedule_work(&timer->work); + return ALARMTIMER_NORESTART; +} + static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; @@ -160,6 +182,68 @@ out: return ret; } +static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) +{ + int ret; + + info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + if (!info->timer) { + ret = -ENOMEM; + goto out; + } + + ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); + if (ret < 0) + goto out_free_timer; + + sysfs_attr_init(&info->timer->attr.attr); + info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); + if (!info->timer->attr.attr.name) { + ret = -ENOMEM; + goto out_free_timer; + } + info->timer->attr.attr.mode = 0444; + info->timer->attr.show = idletimer_tg_show; + + ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); + if (ret < 0) { + pr_debug("couldn't add file to sysfs"); + goto out_free_attr; + } + + /* notify userspace */ + kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); + + list_add(&info->timer->entry, &idletimer_tg_list); + pr_debug("timer type value is %u", info->timer_type); + info->timer->timer_type = info->timer_type; + info->timer->refcnt = 1; + + INIT_WORK(&info->timer->work, idletimer_tg_work); + + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + ktime_t tout; + alarm_init(&info->timer->alarm, ALARM_BOOTTIME, + idletimer_tg_alarmproc); + info->timer->alarm.data = info->timer; + tout = ktime_set(info->timeout, 0); + alarm_start_relative(&info->timer->alarm, tout); + } else { + timer_setup(&info->timer->timer, idletimer_tg_expired, 0); + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + } + + return 0; + +out_free_attr: + kfree(info->timer->attr.attr.name); +out_free_timer: + kfree(info->timer); +out: + return ret; +} + /* * The actual xt_tables plugin. */ @@ -177,13 +261,30 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, return XT_CONTINUE; } -static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) +/* + * The actual xt_tables plugin. 
+ */ +static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, + const struct xt_action_param *par) { - struct idletimer_tg_info *info = par->targinfo; - int ret; + const struct idletimer_tg_info_v1 *info = par->targinfo; - pr_debug("checkentry targinfo%s\n", info->label); + pr_debug("resetting timer %s, timeout period %u\n", + info->label, info->timeout); + + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + ktime_t tout = ktime_set(info->timeout, 0); + alarm_start_relative(&info->timer->alarm, tout); + } else { + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + } + return XT_CONTINUE; +} + +static int idletimer_tg_helper(struct idletimer_tg_info *info) +{ if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; @@ -198,7 +299,23 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } + return 0; +} + +static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) +{ + struct idletimer_tg_info *info = par->targinfo; + int ret; + + pr_debug("checkentry targinfo%s\n", info->label); + + ret = idletimer_tg_helper(info); + if(ret < 0) + { + pr_debug("checkentry helper return invalid\n"); + return -EINVAL; + } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); @@ -222,6 +339,65 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) return 0; } +static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) +{ + struct idletimer_tg_info_v1 *info = par->targinfo; + int ret; + + pr_debug("checkentry targinfo%s\n", info->label); + + ret = idletimer_tg_helper((struct idletimer_tg_info *)info); + if(ret < 0) + { + pr_debug("checkentry helper return invalid\n"); + return -EINVAL; + } + + if (info->timer_type > XT_IDLETIMER_ALARM) { + pr_debug("invalid value for timer type\n"); + return -EINVAL; + } + + mutex_lock(&list_mutex); + + info->timer = __idletimer_tg_find_by_label(info->label); + if (info->timer) { + if (info->timer->timer_type != info->timer_type) { + pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); + mutex_unlock(&list_mutex); + return -EINVAL; + } + + info->timer->refcnt++; + if (info->timer_type & XT_IDLETIMER_ALARM) { + /* calculate remaining expiry time */ + ktime_t tout = alarm_expires_remaining(&info->timer->alarm); + struct timespec64 ktimespec = ktime_to_timespec64(tout); + + if (ktimespec.tv_sec > 0) { + pr_debug("time_expiry_remaining %lld\n", + ktimespec.tv_sec); + alarm_start_relative(&info->timer->alarm, tout); + } + } else { + mod_timer(&info->timer->timer, + msecs_to_jiffies(info->timeout * 1000) + jiffies); + } + pr_debug("increased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } else { + ret = idletimer_tg_create_v1(info); + if (ret < 0) { + pr_debug("failed to create timer\n"); + mutex_unlock(&list_mutex); + return ret; + } + } + + mutex_unlock(&list_mutex); + return 0; +} + static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; @@ -247,7 +423,38 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) mutex_unlock(&list_mutex); } -static struct xt_target idletimer_tg __read_mostly = { +static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) +{ + const struct idletimer_tg_info_v1 *info = par->targinfo; + + pr_debug("destroy targinfo %s\n", info->label); + + mutex_lock(&list_mutex); + + if 
(--info->timer->refcnt == 0) { + pr_debug("deleting timer %s\n", info->label); + + list_del(&info->timer->entry); + if (info->timer->timer_type & XT_IDLETIMER_ALARM) { + alarm_cancel(&info->timer->alarm); + } else { + del_timer_sync(&info->timer->timer); + } + cancel_work_sync(&info->timer->work); + sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); + kfree(info->timer->attr.attr.name); + kfree(info->timer); + } else { + pr_debug("decreased refcnt of timer %s to %u\n", + info->label, info->timer->refcnt); + } + + mutex_unlock(&list_mutex); +} + + +static struct xt_target idletimer_tg[] __read_mostly = { + { .name = "IDLETIMER", .family = NFPROTO_UNSPEC, .target = idletimer_tg_target, @@ -256,6 +463,20 @@ static struct xt_target idletimer_tg __read_mostly = { .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, + }, + { + .name = "IDLETIMER", + .family = NFPROTO_UNSPEC, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, + }, + + }; static struct class *idletimer_tg_class; @@ -283,7 +504,8 @@ static int __init idletimer_tg_init(void) idletimer_tg_kobj = &idletimer_tg_device->kobj; - err = xt_register_target(&idletimer_tg); + err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); + if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; @@ -300,7 +522,7 @@ out: static void __exit idletimer_tg_exit(void) { - xt_unregister_target(&idletimer_tg); + xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 2317721f3ecb..75625d13e976 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -21,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: packet security mark modification"); MODULE_ALIAS("ipt_SECMARK"); MODULE_ALIAS("ip6t_SECMARK"); -#define PFX "SECMARK: " - static u8 mode; static unsigned int diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 8c835ad63729..9c5cfd74a0ee 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -132,7 +132,7 @@ struct xt_hashlimit_htable { const char *name; struct net *net; - struct hlist_head hash[0]; /* hashtable itself */ + struct hlist_head hash[]; /* hashtable itself */ }; static int diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 225a7ab6d79a..19bef176145e 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -71,7 +71,7 @@ struct recent_entry { u_int8_t ttl; u_int8_t index; u_int16_t nstamps; - unsigned long stamps[0]; + unsigned long stamps[]; }; struct recent_table { @@ -82,7 +82,7 @@ struct recent_table { unsigned int entries; u8 nstamps_max_mask; struct list_head lru_list; - struct list_head iphash[0]; + struct list_head iphash[]; }; struct recent_net { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ed77c75bf63f..5ded01ca8b20 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2392,19 +2392,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (nlk_has_extack && extack && extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); - if (err) { - if (!(nlk->flags & NETLINK_F_CAP_ACK)) - payload += nlmsg_len(nlh); - else 
- flags |= NLM_F_CAPPED; - if (nlk_has_extack && extack && extack->bad_attr) - tlvlen += nla_total_size(sizeof(u32)); - } else { + if (err && !(nlk->flags & NETLINK_F_CAP_ACK)) + payload += nlmsg_len(nlh); + else flags |= NLM_F_CAPPED; - - if (nlk_has_extack && extack && extack->cookie_len) - tlvlen += nla_total_size(extack->cookie_len); - } + if (err && nlk_has_extack && extack && extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + if (nlk_has_extack && extack && extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); if (tlvlen) flags |= NLM_F_ACK_TLVS; @@ -2427,20 +2422,16 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); } - if (err) { - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + - in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - - (u8 *)nlh)); - } else { - if (extack->cookie_len) - WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, - extack->cookie_len, - extack->cookie)); - } + if (err && extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + (u8 *)nlh)); + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, extack->cookie)); } nlmsg_end(skb, rep); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 07a7dd185995..d8ae541d22a8 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -305,7 +305,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, struct sk_buff *segs, *nskb; int err; - BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e5b0986215d2..29bd405adbbd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2173,6 +2173,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec64 ts; __u32 ts_status; bool is_drop_n_account = false; + unsigned int slot_id = 0; bool do_vnet = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. 
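For readers mapping the tlvlen arithmetic above onto the wire: an extended ACK is a regular NLMSG_ERROR message whose nlmsgerr payload is followed by optional TLVs, and the rework sizes each TLV independently instead of splitting them across err/!err branches. Sketch of the resulting layout (attribute names from the hunk; shape per the netlink UAPI):

/* struct nlmsghdr      - type NLMSG_ERROR, NLM_F_ACK_TLVS if TLVs follow
 * struct nlmsgerr      - error code plus echo of the offending header
 *                        (full message unless NETLINK_F_CAP_ACK capped it)
 * NLMSGERR_ATTR_MSG    - extack->_msg string, if set
 * NLMSGERR_ATTR_OFFS   - u32 offset of extack->bad_attr, errors only
 * NLMSGERR_ATTR_COOKIE - extack cookie, now emitted for errors too
 */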
@@ -2275,6 +2276,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!h.raw) goto drop_n_account; + if (po->tp_version <= TPACKET_V2) { + slot_id = po->rx_ring.head; + if (test_bit(slot_id, po->rx_ring.rx_owner_map)) + goto drop_n_account; + __set_bit(slot_id, po->rx_ring.rx_owner_map); + } + if (do_vnet && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), @@ -2380,7 +2388,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, #endif if (po->tp_version <= TPACKET_V2) { + spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); + __clear_bit(slot_id, po->rx_ring.rx_owner_map); + spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else { prb_clear_blk_fill_status(&po->rx_ring); @@ -4277,6 +4288,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); + unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; @@ -4362,6 +4374,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } break; default: + if (!tx_ring) { + rx_owner_map = bitmap_alloc(req->tp_frame_nr, + GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); + if (!rx_owner_map) + goto out_free_pg_vec; + } break; } } @@ -4391,6 +4409,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); + if (po->tp_version <= TPACKET_V2) + swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; @@ -4422,6 +4442,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, } out_free_pg_vec: + bitmap_free(rx_owner_map); if (pg_vec) free_pg_vec(pg_vec, order, req->tp_block_nr); out: diff --git a/net/packet/internal.h b/net/packet/internal.h index 82fb2b10f790..907f4cd2a718 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -70,7 +70,10 @@ struct packet_ring_buffer { unsigned int __percpu *pending_refcnt; - struct tpacket_kbdq_core prb_bdqc; + union { + unsigned long *rx_owner_map; + struct tpacket_kbdq_core prb_bdqc; + }; }; extern struct mutex fanout_mutex; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index fe42f986cd94..15ee92d79581 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -285,7 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, gfp_t gfp, rxrpc_notify_rx_t notify_rx, bool upgrade, - bool intr, + enum rxrpc_interruptibility interruptibility, unsigned int debug_id) { struct rxrpc_conn_parameters cp; @@ -310,7 +310,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, memset(&p, 0, sizeof(p)); p.user_call_ID = user_call_ID; p.tx_total_len = tx_total_len; - p.intr = intr; + p.interruptibility = interruptibility; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; @@ -371,45 +371,18 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call); * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check - * @_life: Where to store the life value * - * Allow a kernel service to find out whether a call is still alive - ie. we're - * getting ACKs from the server. Passes back in *_life a number representing - * the life state which can be compared to that returned by a previous call and - * return true if the call is still alive. 
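The tpacket_rcv()/packet_set_ring() changes above close a producer race by giving TPACKET_V1/V2 rings an ownership bitmap: a slot is claimed while a frame is being written and released only once the status word is flipped, so a stalled writer can no longer be overtaken. A minimal scalar model of the claim/release protocol (without the receive-queue lock the real code holds):

#include <stdbool.h>

#define LONG_BITS (8 * sizeof(unsigned long))

static bool claim_slot(unsigned long *owner_map, unsigned int slot)
{
	unsigned long mask = 1UL << (slot % LONG_BITS);
	unsigned long *word = &owner_map[slot / LONG_BITS];

	if (*word & mask)
		return false;	/* previous producer not finished: drop */
	*word |= mask;
	return true;
}

static void release_slot(unsigned long *owner_map, unsigned int slot)
{
	owner_map[slot / LONG_BITS] &= ~(1UL << (slot % LONG_BITS));
}

In the patch the release happens under sk->sk_receive_queue.lock together with __packet_set_status(), which is what makes the handoff to userspace atomic.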
- * - * If the life state stalls, rxrpc_kernel_probe_life() should be called and - * then 2RTT waited. + * Allow a kernel service to find out whether a call is still alive - + * ie. whether it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, - const struct rxrpc_call *call, - u32 *_life) + const struct rxrpc_call *call) { - *_life = call->acks_latest; return call->state != RXRPC_CALL_COMPLETE; } EXPORT_SYMBOL(rxrpc_kernel_check_life); /** - * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive - * @sock: The socket the call is on - * @call: The call to check - * - * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to - * find out whether a call is still alive by pinging it. This should cause the - * life state to be bumped in about 2*RTT. - * - * The must be called in TASK_RUNNING state on pain of might_sleep() objecting. - */ -void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) -{ - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, - rxrpc_propose_ack_ping_for_check_life); - rxrpc_send_ack_packet(call, true, NULL); -} -EXPORT_SYMBOL(rxrpc_kernel_probe_life); - -/** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on * @call: The call to query diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7d730c438404..3eb1ab40ca5c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -489,7 +489,6 @@ enum rxrpc_call_flag { RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ - RXRPC_CALL_IS_INTR, /* The call is interruptible */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ }; @@ -598,6 +597,7 @@ struct rxrpc_call { atomic_t usage; u16 service_id; /* service ID */ u8 security_ix; /* Security type */ + enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ @@ -675,7 +675,6 @@ struct rxrpc_call { /* transmission-phase ACK management */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ @@ -721,7 +720,7 @@ struct rxrpc_call_params { u32 normal; /* Max time since last call packet (msec) */ } timeouts; u8 nr_timeouts; /* Number of timeouts specified */ - bool intr; /* The call is interruptible */ + enum rxrpc_interruptibility interruptibility; /* How interruptible is the call? 
*/ }; struct rxrpc_send_params { diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index c9f34b0a11df..f07970207b54 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -237,8 +237,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - if (p->intr) - __set_bit(RXRPC_CALL_IS_INTR, &call->flags); + call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, atomic_read(&call->usage), diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index ea7d4c21f889..f2a1a5dbb5a7 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -655,13 +655,20 @@ static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp) add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags)) + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + case RXRPC_PREINTERRUPTIBLE: set_current_state(TASK_INTERRUPTIBLE); - else + break; + case RXRPC_UNINTERRUPTIBLE: + default: set_current_state(TASK_UNINTERRUPTIBLE); + break; + } if (call->call_id) break; - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) && + if ((call->interruptibility == RXRPC_INTERRUPTIBLE || + call->interruptibility == RXRPC_PREINTERRUPTIBLE) && signal_pending(current)) { ret = -ERESTARTSYS; break; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ef10fbf71b15..69e09d69c896 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -882,7 +882,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) before(prev_pkt, call->ackr_prev_seq)) goto out; call->acks_latest_ts = skb->tstamp; - call->acks_latest = sp->hdr.serial; call->ackr_first_seq = first_soft_ack; call->ackr_prev_seq = prev_pkt; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 813fd6888142..0fcf157aa09f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -18,6 +18,21 @@ #include "ar-internal.h" /* + * Return true if there's sufficient Tx queue space. + */ +static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) +{ + unsigned int win_size = + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra); + rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack); + + if (_tx_win) + *_tx_win = tx_win; + return call->tx_top - tx_win < win_size; +} + +/* * Wait for space to appear in the Tx queue or a signal to occur. */ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, @@ -26,9 +41,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, { for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (call->tx_top - call->tx_hard_ack < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) + if (rxrpc_check_tx_space(call, NULL)) return 0; if (call->state >= RXRPC_CALL_COMPLETE) @@ -49,7 +62,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, * Wait for space to appear in the Tx queue uninterruptibly, but with * a timeout of 2*RTT if no progress was made and a signal occurred. 
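rxrpc_check_tx_space() above is the predicate all the wait flavours now share; the interruptibility switch that selects between them follows below. In isolation the check says: there is room if the count of packets in flight is below both the peer's advertised window and the local congestion allowance. A scalar restatement (sequence arithmetic kept as plain unsigned wrap-around, as in the original):

#include <stdbool.h>

static bool tx_space(unsigned int tx_top, unsigned int tx_hard_ack,
		     unsigned int tx_winsize, unsigned int cong_cwnd,
		     unsigned int cong_extra)
{
	unsigned int win = tx_winsize;

	if (cong_cwnd + cong_extra < win)
		win = cong_cwnd + cong_extra;

	return tx_top - tx_hard_ack < win;	/* wraps safely */
}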
*/ -static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, +static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, struct rxrpc_call *call) { rxrpc_seq_t tx_start, tx_win; @@ -58,8 +71,8 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, rtt = READ_ONCE(call->peer->rtt); rtt2 = nsecs_to_jiffies64(rtt) * 2; - if (rtt2 < 1) - rtt2 = 1; + if (rtt2 < 2) + rtt2 = 2; timeout = rtt2; tx_start = READ_ONCE(call->tx_hard_ack); @@ -68,16 +81,13 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, set_current_state(TASK_UNINTERRUPTIBLE); tx_win = READ_ONCE(call->tx_hard_ack); - if (call->tx_top - tx_win < - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) + if (rxrpc_check_tx_space(call, &tx_win)) return 0; if (call->state >= RXRPC_CALL_COMPLETE) return call->error; - if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) && - timeout == 0 && + if (timeout == 0 && tx_win == tx_start && signal_pending(current)) return -EINTR; @@ -92,6 +102,26 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, } /* + * Wait for space to appear in the Tx queue uninterruptibly. + */ +static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (rxrpc_check_tx_space(call, NULL)) + return 0; + + if (call->state >= RXRPC_CALL_COMPLETE) + return call->error; + + trace_rxrpc_transmit(call, rxrpc_transmit_wait); + *timeo = schedule_timeout(*timeo); + } +} + +/* * wait for space to appear in the transmit/ACK window * - caller holds the socket locked */ @@ -108,10 +138,19 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, add_wait_queue(&call->waitq, &myself); - if (waitall) - ret = rxrpc_wait_for_tx_window_nonintr(rx, call); - else - ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); + switch (call->interruptibility) { + case RXRPC_INTERRUPTIBLE: + if (waitall) + ret = rxrpc_wait_for_tx_window_waitall(rx, call); + else + ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo); + break; + case RXRPC_PREINTERRUPTIBLE: + case RXRPC_UNINTERRUPTIBLE: + default: + ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo); + break; + } remove_wait_queue(&call->waitq, &myself); set_current_state(TASK_RUNNING); @@ -302,9 +341,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (call->tx_top - call->tx_hard_ack >= - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra)) { + if (!rxrpc_check_tx_space(call, NULL)) { ret = -EAGAIN; if (msg->msg_flags & MSG_DONTWAIT) goto maybe_error; @@ -619,7 +656,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) .call.tx_total_len = -1, .call.user_call_ID = 0, .call.nr_timeouts = 0, - .call.intr = true, + .call.interruptibility = RXRPC_INTERRUPTIBLE, .abort_code = 0, .command = RXRPC_CMD_SEND_DATA, .exclusive = false, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aa7b737fed2e..df4560909157 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -185,7 +185,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ - + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS_TYPE */ + + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + nla_total_size(sizeof(struct 
nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ @@ -789,24 +789,20 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); - if (a->hw_stats_type != TCA_ACT_HW_STATS_TYPE_ANY) { - struct nla_bitfield32 hw_stats_type = { - a->hw_stats_type, - TCA_ACT_HW_STATS_TYPE_ANY, - }; - - if (nla_put(skb, TCA_ACT_HW_STATS_TYPE, sizeof(hw_stats_type), - &hw_stats_type)) - goto nla_put_failure; - } + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && + nla_put_bitfield32(skb, TCA_ACT_HW_STATS, + a->hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; - if (a->tcfa_flags) { - struct nla_bitfield32 flags = { a->tcfa_flags, - a->tcfa_flags, }; + if (a->used_hw_stats_valid && + nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, + a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; - if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags)) - goto nla_put_failure; - } + if (a->tcfa_flags && + nla_put_bitfield32(skb, TCA_ACT_FLAGS, + a->tcfa_flags, a->tcfa_flags)) + goto nla_put_failure; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) @@ -866,22 +862,22 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } -static u8 tcf_action_hw_stats_type_get(struct nlattr *hw_stats_type_attr) +static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) { - struct nla_bitfield32 hw_stats_type_bf; + struct nla_bitfield32 hw_stats_bf; /* If the user did not pass the attr, that means he does * not care about the type. Return "any" in that case * which is setting on all supported types. */ - if (!hw_stats_type_attr) - return TCA_ACT_HW_STATS_TYPE_ANY; - hw_stats_type_bf = nla_get_bitfield32(hw_stats_type_attr); - return hw_stats_type_bf.value; + if (!hw_stats_attr) + return TCA_ACT_HW_STATS_ANY; + hw_stats_bf = nla_get_bitfield32(hw_stats_attr); + return hw_stats_bf.value; } static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; -static const u32 tca_act_hw_stats_type_allowed = TCA_ACT_HW_STATS_TYPE_ANY; +static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, @@ -891,8 +887,8 @@ static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, .validation_data = &tca_act_flags_allowed }, - [TCA_ACT_HW_STATS_TYPE] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_hw_stats_type_allowed }, + [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, + .validation_data = &tca_act_hw_stats_allowed }, }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -901,8 +897,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { - u8 hw_stats_type = TCA_ACT_HW_STATS_TYPE_ANY; struct nla_bitfield32 flags = { 0, 0 }; + u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; @@ -934,8 +930,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; } } - hw_stats_type = - tcf_action_hw_stats_type_get(tb[TCA_ACT_HW_STATS_TYPE]); + hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); } else { @@ -987,7 +982,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, tcf_set_action_cookie(&a->act_cookie, cookie); if (!name) - 
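The act_api dump conversion above trades open-coded struct nla_bitfield32 puts for the nla_put_bitfield32() helper; both emit the same value/selector pair on the wire. A before-and-after sketch, where MY_ATTR is a hypothetical attribute id:

#include <net/netlink.h>

#define MY_ATTR 1	/* hypothetical attribute id */

static int dump_bitfield_sketch(struct sk_buff *skb, u32 value, u32 selector)
{
	/* before: build the two-word payload by hand */
	struct nla_bitfield32 bf = { value, selector };

	if (nla_put(skb, MY_ATTR, sizeof(bf), &bf))
		return -EMSGSIZE;

	/* after: one call, identical wire format */
	return nla_put_bitfield32(skb, MY_ATTR, value, selector);
}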
a->hw_stats_type = hw_stats_type; + a->hw_stats = hw_stats; /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 56b66d215a89..1a766393be62 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1273,7 +1273,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (params) - kfree_rcu(params, rcu); + call_rcu(&params->rcu, tcf_ct_params_free); if (res == ACT_P_CREATED) tcf_idr_insert(tn, *a); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 1ad300e6dbc0..83dd82fc9f40 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -284,10 +284,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* mirror is always swallowed */ if (is_redirect) { - skb2->tc_redirected = 1; - skb2->tc_from_ingress = skb2->tc_at_ingress; - if (skb2->tc_from_ingress) - skb2->tstamp = 0; + skb_set_redirected(skb2, skb2->tc_at_ingress); + /* lets the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3ad718576304..d41d6200d9de 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,6 +409,16 @@ done: return p->tcf_action; } +static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_pedit *d = to_pedit(a); + struct tcf_t *tm = &d->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -485,6 +495,7 @@ static struct tc_action_ops act_pedit_ops = { .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, + .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e857424c387c..b125b2be4467 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -73,6 +73,16 @@ err: return TC_ACT_SHOT; } +static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes, + u32 packets, u64 lastuse, bool hw) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_t *tm = &d->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) }, [TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) }, @@ -323,6 +333,7 @@ static struct tc_action_ops act_skbedit_ops = { .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, + .stats_update = tcf_skbedit_stats_update, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 363264ca2e09..f6a3b969ead0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -708,7 +708,7 @@ static void tc_indr_block_call(struct tcf_block *block, }; INIT_LIST_HEAD(&bo.cb_list); - flow_indr_block_call(dev, &bo, command); + flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); tcf_block_setup(block, &bo); } @@ -3528,9 +3528,9 @@ int tc_setup_flow_action(struct flow_action *flow_action, struct tc_action *act; int i, j, k, err = 0; - BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_ANY != FLOW_ACTION_HW_STATS_TYPE_ANY); -
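The act_ct change above is more than style: kfree_rcu() can only kfree() the object once the grace period ends, while the params structure owns further state that needs its own teardown, hence call_rcu() with a dedicated callback. A generic sketch of the pattern, with placeholder types in place of the act_ct ones:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct owner_sketch {
	struct rcu_head rcu;
	struct some_table *table;	/* resource with its own teardown */
};

void some_table_destroy(struct some_table *t);	/* placeholder */

static void owner_sketch_free(struct rcu_head *head)
{
	struct owner_sketch *p = container_of(head, struct owner_sketch, rcu);

	some_table_destroy(p->table);	/* kfree_rcu() would have leaked this */
	kfree(p);
}

static void owner_sketch_put(struct owner_sketch *p)
{
	call_rcu(&p->rcu, owner_sketch_free);
}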
BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_IMMEDIATE != FLOW_ACTION_HW_STATS_TYPE_IMMEDIATE); - BUILD_BUG_ON(TCA_ACT_HW_STATS_TYPE_DELAYED != FLOW_ACTION_HW_STATS_TYPE_DELAYED); + BUILD_BUG_ON(TCA_ACT_HW_STATS_ANY != FLOW_ACTION_HW_STATS_ANY); + BUILD_BUG_ON(TCA_ACT_HW_STATS_IMMEDIATE != FLOW_ACTION_HW_STATS_IMMEDIATE); + BUILD_BUG_ON(TCA_ACT_HW_STATS_DELAYED != FLOW_ACTION_HW_STATS_DELAYED); if (!exts) return 0; @@ -3545,7 +3545,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, if (err) goto err_out_locked; - entry->hw_stats_type = act->hw_stats_type; + entry->hw_stats = act->hw_stats; if (is_tcf_gact_ok(act)) { entry->id = FLOW_ACTION_ACCEPT; @@ -3613,8 +3613,8 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->mangle.mask = tcf_pedit_mask(act, k); entry->mangle.val = tcf_pedit_val(act, k); entry->mangle.offset = tcf_pedit_offset(act, k); + entry->hw_stats = act->hw_stats; entry = &flow_action->entries[++j]; - entry->hw_stats_type = act->hw_stats_type; } } else if (is_tcf_csum(act)) { entry->id = FLOW_ACTION_CSUM; @@ -3665,6 +3665,9 @@ int tc_setup_flow_action(struct flow_action *flow_action, } else if (is_tcf_skbedit_ptype(act)) { entry->id = FLOW_ACTION_PTYPE; entry->ptype = tcf_skbedit_ptype(act); + } else if (is_tcf_skbedit_priority(act)) { + entry->id = FLOW_ACTION_PRIORITY; + entry->priority = tcf_skbedit_priority(act); } else { err = -EOPNOTSUPP; goto err_out_locked; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 258dc45ab7e3..74a0febcafb8 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -492,7 +492,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes, cls_flower.stats.pkts, - cls_flower.stats.lastused); + cls_flower.stats.lastused, + cls_flower.stats.used_hw_stats, + cls_flower.stats.used_hw_stats_valid); } static void __fl_put(struct cls_fl_filter *f) @@ -738,7 +740,8 @@ static void fl_set_key_val(struct nlattr **tb, } static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, - struct fl_flow_key *mask) + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) { fl_set_key_val(tb, &key->tp_range.tp_min.dst, TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst, @@ -753,20 +756,30 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src, TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); - if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && - htons(key->tp_range.tp_max.dst) <= - htons(key->tp_range.tp_min.dst)) || - (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && - htons(key->tp_range.tp_max.src) <= - htons(key->tp_range.tp_min.src))) + if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && + htons(key->tp_range.tp_max.dst) <= + htons(key->tp_range.tp_min.dst)) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_PORT_DST_MIN], + "Invalid destination port range (min must be strictly smaller than max)"); return -EINVAL; + } + if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && + htons(key->tp_range.tp_max.src) <= + htons(key->tp_range.tp_min.src)) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_PORT_SRC_MIN], + "Invalid source port range (min must be strictly smaller than max)"); + return -EINVAL; + } return 0; } static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, - struct flow_dissector_key_mpls *key_mask) + struct flow_dissector_key_mpls *key_mask, + 
struct netlink_ext_ack *extack) { if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); @@ -775,24 +788,36 @@ static int fl_set_key_mpls(struct nlattr **tb, if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); - if (bos & ~MPLS_BOS_MASK) + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; + } key_val->mpls_bos = bos; key_mask->mpls_bos = MPLS_BOS_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); - if (tc & ~MPLS_TC_MASK) + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_TC], + "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; + } key_val->mpls_tc = tc; key_mask->mpls_tc = MPLS_TC_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); - if (label & ~MPLS_LABEL_MASK) + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_LABEL], + "Label must be between 0 and 1048575"); return -EINVAL; + } key_val->mpls_label = label; key_mask->mpls_label = MPLS_LABEL_MASK; } @@ -833,14 +858,16 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask, } } -static int fl_set_key_flags(struct nlattr **tb, - u32 *flags_key, u32 *flags_mask) +static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key, + u32 *flags_mask, struct netlink_ext_ack *extack) { u32 key, mask; /* mask is mandatory for flags */ - if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) + if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) { + NL_SET_ERR_MSG(extack, "Missing flags mask"); return -EINVAL; + } key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); @@ -1364,7 +1391,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, sizeof(key->icmp.code)); } else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) || key->basic.n_proto == htons(ETH_P_MPLS_MC)) { - ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls); + ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls, extack); if (ret) return ret; } else if (key->basic.n_proto == htons(ETH_P_ARP) || @@ -1389,7 +1416,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || key->basic.ip_proto == IPPROTO_SCTP) { - ret = fl_set_key_port_range(tb, key, mask); + ret = fl_set_key_port_range(tb, key, mask, extack); if (ret) return ret; } @@ -1451,7 +1478,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; if (tb[TCA_FLOWER_KEY_FLAGS]) - ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); + ret = fl_set_key_flags(tb, &key->control.flags, + &mask->control.flags, extack); return ret; } diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index a34b36adb9b7..8d39dbcf1746 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -338,7 +338,9 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes, - cls_mall.stats.pkts, cls_mall.stats.lastused); + cls_mall.stats.pkts, cls_mall.stats.lastused, + cls_mall.stats.used_hw_stats, + cls_mall.stats.used_hw_stats_valid); } static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 6f8786b06bde..5efa3e7ace15 100644 --- 
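Every rejection added to cls_flower above follows the same extack discipline: fail with -EINVAL and point the sender at the precise offending attribute along with a human-readable reason. A minimal sketch of a hypothetical range-checking parser in that style:

#include <net/netlink.h>

/* Hypothetical helper: parse a u8 attribute that must not exceed max. */
static int parse_bounded_u8(struct nlattr *attr, u8 max, u8 *out,
			    struct netlink_ext_ack *extack)
{
	u8 val = nla_get_u8(attr);

	if (val > max) {
		NL_SET_ERR_MSG_ATTR(extack, attr, "value out of range");
		return -EINVAL;
	}
	*out = val;
	return 0;
}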
a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -534,8 +534,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, fp = &b->ht[h]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { - if (pfp == f) { - *fp = f->next; + if (pfp == fold) { + rcu_assign_pointer(*fp, fold->next); break; } } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 09b7dc5fe7e0..9904299424a1 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -261,8 +261,10 @@ static void tcindex_partial_destroy_work(struct work_struct *work) struct tcindex_data, rwork); + rtnl_lock(); kfree(p->perfect); kfree(p); + rtnl_unlock(); } static void tcindex_free_perfect_hash(struct tcindex_data *cp) @@ -357,6 +359,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tcindex_alloc_perfect_hash(net, cp) < 0) goto errout; + cp->alloc_hash = cp->hash; for (i = 0; i < min(cp->hash, p->hash); i++) cp->perfect[i].res = p->perfect[i].res; balloc = 1; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 50794125bf02..0d99df1e764d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -618,21 +618,28 @@ void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) } EXPORT_SYMBOL(qdisc_watchdog_init); -void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) +void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, + u64 delta_ns) { if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state)) return; - if (wd->last_expires == expires) - return; + if (hrtimer_is_queued(&wd->timer)) { + /* If timer is already set in [expires, expires + delta_ns], + * do not reprogram it. + */ + if (wd->last_expires - expires <= delta_ns) + return; + } wd->last_expires = expires; - hrtimer_start(&wd->timer, - ns_to_ktime(expires), - HRTIMER_MODE_ABS_PINNED); + hrtimer_start_range_ns(&wd->timer, + ns_to_ktime(expires), + delta_ns, + HRTIMER_MODE_ABS_PINNED); } -EXPORT_SYMBOL(qdisc_watchdog_schedule_ns); +EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index b2905b03a432..2eaac2ff380f 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -181,6 +181,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) s64 credits; int len; + /* The previous packet is still being sent */ + if (now < q->last) { + qdisc_watchdog_schedule_ns(&q->watchdog, q->last); + return NULL; + } if (q->credits < 0) { credits = timediff_to_credits(now - q->last, q->idleslope); @@ -212,7 +217,12 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) credits += q->credits; q->credits = max_t(s64, credits, q->locredit); - q->last = now; + /* Estimate of the transmission of the last byte of the packet in ns */ + if (unlikely(atomic64_read(&q->port_rate) == 0)) + q->last = now; + else + q->last = now + div64_s64(len * NSEC_PER_SEC, + atomic64_read(&q->port_rate)); return skb; } diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 371ad84def3b..4c060134c736 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -121,6 +121,8 @@ struct fq_sched_data { u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; + + u32 timer_slack; /* hrtimer slack in ns */ struct qdisc_watchdog watchdog; }; @@ -504,8 +506,9 @@ begin: head = &q->old_flows; if (!head->first) { if (q->time_next_delayed_flow != ~0ULL) - qdisc_watchdog_schedule_ns(&q->watchdog, - 
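The sch_cbs hunk above stops pretending transmission is instantaneous: after a dequeue, q->last now records when the packet's final byte will have left at port_rate, and the next dequeue waits for that moment. Plain-C sketch of the estimate, assuming port_rate is in bytes per second as the patch's len * NSEC_PER_SEC / port_rate arithmetic implies:

#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

/* When does the last byte of a len-byte packet clear the port? */
static int64_t tx_end_ns(int64_t now, int64_t len, int64_t port_rate)
{
	if (port_rate == 0)	/* rate unknown: degrade to the old behaviour */
		return now;
	return now + (len * NSEC_PER_SEC) / port_rate;
}

For a 1000-byte frame at 125,000,000 bytes/s (1 Gb/s) the estimate lands 8 usec after now, which is exactly the gap the earlier "previous packet is still being sent" check enforces.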
q->time_next_delayed_flow); + qdisc_watchdog_schedule_range_ns(&q->watchdog, + q->time_next_delayed_flow, + q->timer_slack); return NULL; } } @@ -735,6 +738,8 @@ static int fq_resize(struct Qdisc *sch, u32 log) } static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { + [TCA_FQ_UNSPEC] = { .strict_start_type = TCA_FQ_TIMER_SLACK }, + [TCA_FQ_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 }, [TCA_FQ_QUANTUM] = { .type = NLA_U32 }, @@ -747,6 +752,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_ORPHAN_MASK] = { .type = NLA_U32 }, [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -833,6 +839,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, q->ce_threshold = (u64)NSEC_PER_USEC * nla_get_u32(tb[TCA_FQ_CE_THRESHOLD]); + if (tb[TCA_FQ_TIMER_SLACK]) + q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (!err) { sch_tree_unlock(sch); err = fq_resize(sch, fq_log); @@ -884,6 +893,8 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->orphan_mask = 1024 - 1; q->low_rate_threshold = 550000 / 8; + q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -924,7 +935,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || - nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) + nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -947,7 +959,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.flows_plimit = q->stat_flows_plimit; st.pkts_too_long = q->stat_pkts_too_long; st.allocation_errors = q->stat_allocation_errors; - st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.time_next_delayed_flow = q->time_next_delayed_flow + q->timer_slack - + ktime_get_ns(); st.flows = q->flows; st.inactive_flows = q->inactive_flows; st.throttled_flows = q->throttled_flows; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 3ef0a4f7399b..c7de47c942e3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -349,10 +349,6 @@ static int red_dump_offload_stats(struct Qdisc *sch) static int red_dump(struct Qdisc *sch, struct sk_buff *skb) { struct red_sched_data *q = qdisc_priv(sch); - struct nla_bitfield32 flags_bf = { - .selector = red_supported_flags, - .value = q->flags, - }; struct nlattr *opts = NULL; struct tc_red_qopt opt = { .limit = q->limit, @@ -375,7 +371,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || - nla_put(skb, TCA_RED_FLAGS, sizeof(flags_bf), &flags_bf)) + nla_put_bitfield32(skb, TCA_RED_FLAGS, + q->flags, red_supported_flags)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/socket.c b/net/socket.c index b79a05de7c6e..2eecf1517f76 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1707,7 +1707,8 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) int __sys_accept4_file(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) + int 
__user *upeer_addrlen, int flags, + unsigned long nofile) { struct socket *sock, *newsock; struct file *newfile; @@ -1738,7 +1739,7 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = get_unused_fd_flags(flags); + newfd = __get_unused_fd_flags(flags, nofile); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); @@ -1807,7 +1808,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, f = fdget(fd); if (f.file) { ret = __sys_accept4_file(f.file, 0, upeer_sockaddr, - upeer_addrlen, flags); + upeer_addrlen, flags, + rlimit(RLIMIT_NOFILE)); if (f.flags) fput(f.file); } diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 6d466ebdb64f..871feadbbc19 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -394,6 +394,11 @@ static inline u32 msg_connected(struct tipc_msg *m) return msg_type(m) == TIPC_CONN_MSG; } +static inline u32 msg_direct(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_DIRECT_MSG; +} + static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); diff --git a/net/tipc/node.c b/net/tipc/node.c index 0c88778c88b5..10292c942384 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1586,7 +1586,8 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: - if (msg_connected(hdr) || msg_named(hdr)) { + if (msg_connected(hdr) || msg_named(hdr) || + msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 693e8902161e..87466607097f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1461,7 +1461,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) } __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid, true); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1c5574e2e058..a562ebaaa33c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -366,7 +366,7 @@ static int tls_do_allocation(struct sock *sk, if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { - sk->sk_prot->enter_memory_pressure(sk); + READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 82225bcc1117..156efce50dbd 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -63,13 +63,14 @@ static DEFINE_MUTEX(tcpv4_prot_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], - struct proto *base); + const struct proto *base); void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? 
TLSV6 : TLSV4; - sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]; + WRITE_ONCE(sk->sk_prot, + &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]); } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -312,7 +313,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) write_lock_bh(&sk->sk_callback_lock); if (free_ctx) rcu_assign_pointer(icsk->icsk_ulp_data, NULL); - sk->sk_prot = ctx->sk_proto; + WRITE_ONCE(sk->sk_prot, ctx->sk_proto); if (sk->sk_write_space == tls_write_space) sk->sk_write_space = ctx->sk_write_space; write_unlock_bh(&sk->sk_callback_lock); @@ -621,38 +622,39 @@ struct tls_context *tls_ctx_create(struct sock *sk) mutex_init(&ctx->tx_lock); rcu_assign_pointer(icsk->icsk_ulp_data, ctx); - ctx->sk_proto = sk->sk_prot; + ctx->sk_proto = READ_ONCE(sk->sk_prot); return ctx; } static void tls_build_proto(struct sock *sk) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; + const struct proto *prot = READ_ONCE(sk->sk_prot); /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ if (ip_ver == TLSV6 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { + unlikely(prot != smp_load_acquire(&saved_tcpv6_prot))) { mutex_lock(&tcpv6_prot_mutex); - if (likely(sk->sk_prot != saved_tcpv6_prot)) { - build_protos(tls_prots[TLSV6], sk->sk_prot); - smp_store_release(&saved_tcpv6_prot, sk->sk_prot); + if (likely(prot != saved_tcpv6_prot)) { + build_protos(tls_prots[TLSV6], prot); + smp_store_release(&saved_tcpv6_prot, prot); } mutex_unlock(&tcpv6_prot_mutex); } if (ip_ver == TLSV4 && - unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv4_prot))) { + unlikely(prot != smp_load_acquire(&saved_tcpv4_prot))) { mutex_lock(&tcpv4_prot_mutex); - if (likely(sk->sk_prot != saved_tcpv4_prot)) { - build_protos(tls_prots[TLSV4], sk->sk_prot); - smp_store_release(&saved_tcpv4_prot, sk->sk_prot); + if (likely(prot != saved_tcpv4_prot)) { + build_protos(tls_prots[TLSV4], prot); + smp_store_release(&saved_tcpv4_prot, prot); } mutex_unlock(&tcpv4_prot_mutex); } } static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], - struct proto *base) + const struct proto *base) { prot[TLS_BASE][TLS_BASE] = *base; prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; diff --git a/net/wireless/core.c b/net/wireless/core.c index 3e25229a059d..341402b4f178 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -693,8 +693,14 @@ int wiphy_register(struct wiphy *wiphy) ~(BIT(NL80211_PREAMBLE_LEGACY) | BIT(NL80211_PREAMBLE_HT) | BIT(NL80211_PREAMBLE_VHT) | + BIT(NL80211_PREAMBLE_HE) | BIT(NL80211_PREAMBLE_DMG)))) return -EINVAL; + if (WARN_ON((wiphy->pmsr_capa->ftm.trigger_based || + wiphy->pmsr_capa->ftm.non_trigger_based) && + !(wiphy->pmsr_capa->ftm.preambles & + BIT(NL80211_PREAMBLE_HE)))) + return -EINVAL; if (WARN_ON(wiphy->pmsr_capa->ftm.bandwidths & ~(BIT(NL80211_CHAN_WIDTH_20_NOHT) | BIT(NL80211_CHAN_WIDTH_20) | diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 750b73a52fd8..5fa402144cda 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include <linux/if.h> @@ -276,6 +276,8 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_NUM_FTMR_RETRIES] = { .type = NLA_U8 }, 
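The tls_main and tls_device conversions above are one coordinated data-race fix: sk->sk_prot is swapped during ULP setup and teardown while lockless paths dereference it, so every access becomes a READ_ONCE()/WRITE_ONCE() pair and readers snapshot the pointer once. A generic sketch of the pattern, not the tls code itself:

#include <linux/compiler.h>

struct ops_sketch {
	void (*enter_pressure)(void *obj);
};

static struct ops_sketch *live_ops;

static void swap_ops(struct ops_sketch *new_ops)
{
	WRITE_ONCE(live_ops, new_ops);		/* single tagged store */
}

static void notify_pressure(void *obj)
{
	struct ops_sketch *o = READ_ONCE(live_ops);	/* snapshot once */

	if (o && o->enter_pressure)
		o->enter_pressure(obj);
}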
[NL80211_PMSR_FTM_REQ_ATTR_REQUEST_LCI] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, }; static const struct nla_policy @@ -658,6 +660,9 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_HE_BSS_COLOR] = NLA_POLICY_NESTED(he_bss_color_policy), [NL80211_ATTR_TID_CONFIG] = NLA_POLICY_NESTED_ARRAY(nl80211_tid_config_attr_policy), + [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG }, + [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), + [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), }; /* policy for the key attributes */ @@ -1884,6 +1889,12 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_FTMS_PER_BURST, cap->ftm.max_ftms_per_burst)) return -ENOBUFS; + if (cap->ftm.trigger_based && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED)) + return -ENOBUFS; + if (cap->ftm.non_trigger_based && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED)) + return -ENOBUFS; nla_nest_end(msg, ftm); return 0; @@ -4748,6 +4759,9 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len); if (cap && cap[1] >= sizeof(*params->he_cap) + 1) params->he_cap = (void *)(cap + 3); + cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ies, ies_len); + if (cap && cap[1] >= sizeof(*params->he_oper) + 1) + params->he_oper = (void *)(cap + 3); } static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, @@ -6271,11 +6285,22 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) params.mac = nla_data(info->attrs[NL80211_ATTR_MAC]); - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + switch (dev->ieee80211_ptr->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_P2P_GO: + /* always accept these */ + break; + case NL80211_IFTYPE_ADHOC: + /* conditionally accept */ + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_DEL_IBSS_STA)) + break; + return -EINVAL; + default: return -EINVAL; + } if (!rdev->ops->del_station) return -EOPNOTSUPP; @@ -9306,6 +9331,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, return r; settings->control_port_over_nl80211 = true; + + if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_PREAUTH]) + settings->control_port_no_preauth = true; } if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { @@ -10494,6 +10522,15 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); } + if (info->attrs[NL80211_ATTR_PMK_LIFETIME]) + pmksa.pmk_lifetime = + nla_get_u32(info->attrs[NL80211_ATTR_PMK_LIFETIME]); + + if (info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]) + pmksa.pmk_reauth_threshold = + nla_get_u8( + info->attrs[NL80211_ATTR_PMK_REAUTH_THRESHOLD]); + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && !(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP && @@ -16753,7 
+16790,7 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, goto nla_put_failure; if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) && - nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) + nla_put_u32(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) goto nla_put_failure; if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) && diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index c09fbf09549d..63dc8023447f 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -126,6 +126,38 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, "FTM: civic location request not supported"); } + out->ftm.trigger_based = + !!tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED]; + if (out->ftm.trigger_based && !capa->ftm.trigger_based) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED], + "FTM: trigger based ranging is not supported"); + return -EINVAL; + } + + out->ftm.non_trigger_based = + !!tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED]; + if (out->ftm.non_trigger_based && !capa->ftm.non_trigger_based) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED], + "FTM: non-trigger-based ranging is not supported"); + return -EINVAL; + } + + if (out->ftm.trigger_based && out->ftm.non_trigger_based) { + NL_SET_ERR_MSG(info->extack, + "FTM: can't set both trigger based and non trigger based"); + return -EINVAL; + } + + if ((out->ftm.trigger_based || out->ftm.non_trigger_based) && + out->ftm.preamble != NL80211_PREAMBLE_HE) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_PREAMBLE], + "FTM: non EDCA based ranging must use HE preamble"); + return -EINVAL; + } + return 0; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index dd41e41f9d26..4000382aef48 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -2019,7 +2019,11 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, spin_lock_bh(&rdev->bss_lock); - if (WARN_ON(cbss->pub.channel == chan)) + /* + * Some APs use CSA also for bandwidth changes, i.e., without actually + * changing the control channel, so no need to update in such a case.
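pmsr_parse_ftm() above layers three checks onto the new flag pair: each flag must be backed by the matching capability, the two are mutually exclusive, and either one forces the HE preamble. The shape of that validation as a generic sketch, detached from the nl80211 types:

#include <errno.h>
#include <stdbool.h>

static int validate_flag_pair(bool a, bool b, bool capa_a, bool capa_b)
{
	if ((a && !capa_a) || (b && !capa_b))
		return -EINVAL;		/* requested but not supported */
	if (a && b)
		return -EINVAL;		/* mutually exclusive */
	return 0;
}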
+ */ + if (cbss->pub.channel == chan) goto done; /* use transmitting bss */ diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 50f567a88f45..6cc7f7f1dd68 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -46,6 +46,25 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, pskb_pull(skb, skb->mac_len + x->props.header_len); } +static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, + unsigned int hsize) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + int phlen = 0; + + if (xo->flags & XFRM_GSO_SEGMENT) + skb->transport_header = skb->network_header + hsize; + + skb_reset_mac_len(skb); + if (x->sel.family != AF_INET6) { + phlen = IPV4_BEET_PHMAXLEN; + if (x->outer_mode.family == AF_INET6) + phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr); + } + + pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen)); +} + /* Adjust pointers into the packet when IPsec is done at layer2 */ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { @@ -66,9 +85,16 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) return __xfrm_transport_prep(x, skb, sizeof(struct ipv6hdr)); break; + case XFRM_MODE_BEET: + if (x->outer_mode.family == AF_INET) + return __xfrm_mode_beet_prep(x, skb, + sizeof(struct iphdr)); + if (x->outer_mode.family == AF_INET6) + return __xfrm_mode_beet_prep(x, skb, + sizeof(struct ipv6hdr)); + break; case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: - case XFRM_MODE_BEET: break; } } @@ -78,8 +104,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int err; unsigned long flags; struct xfrm_state *x; - struct sk_buff *skb2, *nskb; struct softnet_data *sd; + struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; @@ -168,14 +194,14 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur } else { if (skb == skb2) skb = nskb; - - if (!skb) - return NULL; + else + pskb->next = nskb; continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); + pskb = skb2; } return skb; @@ -383,6 +409,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void return xfrm_dev_feat_change(dev); case NETDEV_DOWN: + case NETDEV_UNREGISTER: return xfrm_dev_down(dev); } return NOTIFY_DONE; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index fafc7aba705f..2fd3d990d992 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -535,8 +535,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb { struct sk_buff *segs, *nskb; - BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); - BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, 0); kfree_skb(skb); if (IS_ERR(segs)) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dbda08ec566e..297b2fdb3c29 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -434,7 +434,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + write_lock_bh(&policy->lock); policy->walk.dead = 1; + write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); @@ -2613,7 +2615,6 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, xdst->xfrm_genid = xfrm[i]->genid; 
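The validate_xmit_xfrm() hunk above fixes a classic list-iteration bug: when a segment in the middle of the chain is dropped, its predecessor must be re-linked to the successor, or the remainder of the chain is lost; hence the new trailing pskb pointer. The same repair on a plain singly linked list:

#include <stdbool.h>
#include <stdlib.h>

struct node {
	struct node *next;
	bool bad;
};

/* Drop every node marked bad while keeping the survivors linked. */
static struct node *drop_bad(struct node *head)
{
	struct node *n = head, *prev = NULL;

	while (n) {
		struct node *next = n->next;

		if (n->bad) {
			if (prev)
				prev->next = next;	/* the missing re-link */
			else
				head = next;
			free(n);
		} else {
			prev = n;
		}
		n = next;
	}
	return head;
}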
dst1->obsolete = DST_OBSOLETE_FORCE_CHK; - dst1->flags |= DST_HOST; dst1->lastuse = now; dst1->input = dst_discard; @@ -2899,7 +2900,7 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; - dst1->flags |= DST_HOST | DST_XFRM_QUEUE; + dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 170d6e7f31d3..8be2d926acc2 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -612,7 +612,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) { struct xfrm_state *x; - x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO); + x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC); if (x) { write_pnet(&x->xs_net, net); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b88ba45ff1ac..e6cfaa680ef3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -110,7 +110,8 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs) return 0; uctx = nla_data(rt); - if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) + if (uctx->len > nla_len(rt) || + uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) return -EINVAL; return 0; @@ -2275,6 +2276,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, err = verify_newpolicy_info(&ua->policy); if (err) goto free_state; + err = verify_sec_ctx_len(attrs); + if (err) + goto free_state; /* build an XP */ xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
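The verify_sec_ctx_len() hardening above generalizes well: a length field carried inside an attribute payload is caller-controlled and must be bounded by the payload's real size before anything trusts it. A sketch in the same spirit, using nla_len() and the xfrm type from the patch (the wrapper itself is hypothetical):

#include <net/netlink.h>
#include <linux/xfrm.h>

static int sec_ctx_len_ok(const struct nlattr *rt)
{
	const struct xfrm_user_sec_ctx *uctx;

	if (nla_len(rt) < (int)sizeof(*uctx))	/* header must fit first */
		return -EINVAL;

	uctx = nla_data(rt);
	if (uctx->len > nla_len(rt) ||		/* inner len within payload */
	    uctx->len != sizeof(*uctx) + uctx->ctx_len)
		return -EINVAL;

	return 0;
}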